623 files changed, 20632 insertions, 7399 deletions
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index ea70b93..3f2eb3a 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -58,7 +58,6 @@ jobs:
       ref: ${{ steps.vars.outputs.ref }}
       upload: ${{ steps.vars.outputs.upload }}
       target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }}
-      ccache: ${{ steps.vars.outputs.ccache }}
       build-flang: ${{ steps.vars.outputs.build-flang }}
       release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }}
       release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }}
@@ -123,13 +122,6 @@ jobs:
         echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
 
         target="$RUNNER_OS-$RUNNER_ARCH"
-        # The hendrikmuhs/ccache-action action does not support installing sccache
-        # on arm64 Linux.
-        if [ "$target" = "Linux-ARM64" ]; then
-          echo ccache=ccache >> $GITHUB_OUTPUT
-        else
-          echo ccache=sccache >> $GITHUB_OUTPUT
-        fi
 
         # The macOS builds try to cross compile some libraries so we need to
         # add extra CMake args to disable them.
@@ -222,8 +214,6 @@ jobs:
     - name: Configure
       id: build
       shell: bash
-      env:
-        CCACHE_BIN: ${{ needs.prepare.outputs.ccache }}
       run: |
         # There were some issues on the ARM64 MacOS runners with trying to build x86 object,
         # so we need to set some extra cmake flags to disable this.
diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
index e825547..799e02f 100644
--- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
+++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
@@ -90,6 +90,7 @@ bool IncludeFixerActionFactory::runInvocation(
 
   // Set up Clang.
   CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps));
+  Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr());
   Compiler.setFileManager(Files);
 
   // Create the compiler's actual diagnostics engine. We want to drop all
diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst
index bda8466..b65e18c 100644
--- a/clang/docs/AllocToken.rst
+++ b/clang/docs/AllocToken.rst
@@ -37,8 +37,8 @@ The default mode to calculate tokens is:
   pointers.
 
 Other token ID assignment modes are supported, but they may be subject to
-change or removal. These may (experimentally) be selected with ``-mllvm
--alloc-token-mode=<mode>``:
+change or removal. These may (experimentally) be selected with ``-Xclang
+-falloc-token-mode=<mode>``:
 
 * ``typehash``: This mode assigns a token ID based on the hash of the allocated
   type's name.
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index c75c170..61b5bab 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -256,7 +256,7 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | device-specific environment variables                        | :none:`unclaimed`        |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | omp_target_is_accessible routine                             | :part:`In Progress`      | https://github.com/llvm/llvm-project/pull/138294                      |
+| device                       | omp_target_is_accessible routine                             | :good:`done`             | https://github.com/llvm/llvm-project/pull/138294                      |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | omp_get_mapped_ptr routine                                   | :good:`done`             | D141545                                                               |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index fe77f91..e6e33e7 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -447,6 +447,7 @@ Bug Fixes to Attribute Support
 - Using ``[[gnu::cleanup(some_func)]]`` where some_func is annotated with
   ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520)
 - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075)
+- Fix crashes or missing diagnostics with the ``device_kernel`` attribute. (#GH161905)
 
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index 8b4554e..4d52805 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -815,6 +815,17 @@ public:
   Stmt *getAssociatedStmt() {
     return OpenACCAssociatedStmtConstruct::getAssociatedStmt();
   }
+
+  // A struct to represent a broken-down version of the associated statement,
+  // providing the information specified in OpenACC3.3 Section 2.12.
+  struct StmtInfo {
+    const Expr *V;
+    const Expr *X;
+    // TODO: OpenACC: We should expand this as we're implementing the other
+    // atomic construct kinds.
+  };
+
+  const StmtInfo getAssociatedStmtInfo() const;
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index eb48a0c..b320f4b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1623,7 +1623,7 @@ def SYCLKernel : InheritableAttr {
   let Documentation = [SYCLKernelDocs];
 }
 
-def DeviceKernel : DeclOrTypeAttr {
+def DeviceKernel : InheritableAttr {
   let Spellings = [Clang<"device_kernel">,
                    Clang<"nvptx_kernel">, Clang<"amdgpu_kernel">,
                    CustomKeyword<"__kernel">, CustomKeyword<"kernel">];
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index cae06c3..5d5cf25 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -447,10 +447,6 @@ public:
 
   std::optional<double> AllowRuntimeCheckSkipHotCutoff;
 
-  /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use
-  /// pass default).
-  std::optional<uint64_t> AllocTokenMax;
-
   /// List of backend command-line options for -fembed-bitcode.
   std::vector<uint8_t> CmdArgs;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5ff4cc4..20b4994 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4126,6 +4126,9 @@ def warn_missing_sdksettings_for_availability_checking : Warning<
   "%0 availability is ignored without a valid 'SDKSettings.json' in the SDK">,
   InGroup<DiagGroup<"ignored-availability-without-sdk-settings">>;
 
+def err_hidden_device_kernel
+    : Error<"%0 is specified as a device kernel but it is not externally visible">;
+
 // Thread Safety Attributes
 def warn_thread_attribute_ignored : Warning<
   "ignoring %0 attribute because its argument is invalid">,
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 260a753..8aa89d8 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -25,6 +25,7 @@
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/DXContainer.h"
+#include "llvm/Support/AllocToken.h"
 #include "llvm/TargetParser/Triple.h"
 #include <optional>
 #include <string>
@@ -565,6 +566,13 @@ public:
   bool AtomicFineGrainedMemory = false;
   bool AtomicIgnoreDenormalMode = false;
 
+  /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use
+  /// target default).
+  std::optional<uint64_t> AllocTokenMax;
+
+  /// The allocation token mode.
+  std::optional<llvm::AllocTokenMode> AllocTokenMode;
+
   LangOptions();
 
   /// Set language defaults for the given input language and
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7ae153d..0c9584f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2751,6 +2751,10 @@ def falloc_token_max_EQ : Joined<["-"], "falloc-token-max=">,
   MetaVarName<"<N>">,
   HelpText<"Limit to maximum N allocation tokens (0 = no max)">;
 
+def falloc_token_mode_EQ : Joined<["-"], "falloc-token-mode=">,
+  Group<f_Group>, Visibility<[CC1Option]>,
+  HelpText<"Set the allocation token mode (experimental)">;
+
 def fallow_runtime_check_skip_hot_cutoff_EQ
     : Joined<["-"], "fallow-runtime-check-skip-hot-cutoff=">,
       Group<f_clang_Group>,
diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h
index f66df89..3cea159 100644
--- a/clang/include/clang/Frontend/ASTUnit.h
+++ b/clang/include/clang/Frontend/ASTUnit.h
@@ -499,6 +499,11 @@ public:
     return *PPOpts;
   }
 
+  IntrusiveRefCntPtr<llvm::vfs::FileSystem> getVirtualFileSystemPtr() {
+    // FIXME: Don't defer VFS ownership to the FileManager.
+    return FileMgr->getVirtualFileSystemPtr();
+  }
+
   const FileManager &getFileManager() const { return *FileMgr; }
   FileManager &getFileManager() { return *FileMgr; }
   IntrusiveRefCntPtr<FileManager> getFileManagerPtr() { return FileMgr; }
diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h
index 44fff69..2403cbb 100644
--- a/clang/include/clang/Frontend/CompilerInstance.h
+++ b/clang/include/clang/Frontend/CompilerInstance.h
@@ -460,7 +460,7 @@ public:
     FileMgr.resetWithoutRelease();
   }
 
-  /// Replace the current file manager and virtual file system.
+  /// Replace the current file manager.
   void setFileManager(IntrusiveRefCntPtr<FileManager> Value);
 
   /// @}
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index 9ea104c..fd12bc4 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -86,7 +86,7 @@ void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID,
   ID.AddPointer(ConstraintOwner);
   ID.AddInteger(TemplateArgs.size());
   for (auto &Arg : TemplateArgs)
-    C.getCanonicalTemplateArgument(Arg).Profile(ID, C);
+    Arg.Profile(ID, C);
 }
 
 ConceptReference *
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 6b98927..f15b3c1 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -6432,6 +6432,13 @@ bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) {
   return this->emitNoRet(SourceInfo{});
 }
 
+static uint32_t getBitWidth(const Expr *E) {
+  assert(E->refersToBitField());
+  const auto *ME = cast<MemberExpr>(E);
+  const auto *FD = cast<FieldDecl>(ME->getMemberDecl());
+  return FD->getBitWidthValue();
+}
+
 template <class Emitter>
 bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
   const Expr *SubExpr = E->getSubExpr();
@@ -6460,10 +6467,15 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       return DiscardResult ? this->emitPopPtr(E) : true;
     }
 
-    if (T == PT_Float) {
+    if (T == PT_Float)
       return DiscardResult ? this->emitIncfPop(getFPOptions(E), E)
                            : this->emitIncf(getFPOptions(E), E);
-    }
+
+    if (SubExpr->refersToBitField())
+      return DiscardResult ? this->emitIncPopBitfield(*T, E->canOverflow(),
+                                                      getBitWidth(SubExpr), E)
+                           : this->emitIncBitfield(*T, E->canOverflow(),
+                                                   getBitWidth(SubExpr), E);
 
     return DiscardResult ? this->emitIncPop(*T, E->canOverflow(), E)
                          : this->emitInc(*T, E->canOverflow(), E);
@@ -6484,9 +6496,15 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       return DiscardResult ? this->emitPopPtr(E) : true;
     }
 
-    if (T == PT_Float) {
+    if (T == PT_Float)
       return DiscardResult ? this->emitDecfPop(getFPOptions(E), E)
                            : this->emitDecf(getFPOptions(E), E);
+
+    if (SubExpr->refersToBitField()) {
+      return DiscardResult ? this->emitDecPopBitfield(*T, E->canOverflow(),
+                                                      getBitWidth(SubExpr), E)
+                           : this->emitDecBitfield(*T, E->canOverflow(),
+                                                   getBitWidth(SubExpr), E);
     }
 
     return DiscardResult ? this->emitDecPop(*T, E->canOverflow(), E)
@@ -6515,6 +6533,11 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
     if (DiscardResult) {
       if (T == PT_Float)
         return this->emitIncfPop(getFPOptions(E), E);
+      if (SubExpr->refersToBitField())
+        return DiscardResult ? this->emitIncPopBitfield(*T, E->canOverflow(),
+                                                        getBitWidth(SubExpr), E)
+                             : this->emitIncBitfield(*T, E->canOverflow(),
+                                                     getBitWidth(SubExpr), E);
       return this->emitIncPop(*T, E->canOverflow(), E);
     }
 
@@ -6530,6 +6553,11 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
         return false;
       if (!this->emitStoreFloat(E))
         return false;
+    } else if (SubExpr->refersToBitField()) {
+      assert(isIntegralType(*T));
+      if (!this->emitPreIncBitfield(*T, E->canOverflow(), getBitWidth(SubExpr),
+                                    E))
+        return false;
     } else {
       assert(isIntegralType(*T));
       if (!this->emitPreInc(*T, E->canOverflow(), E))
@@ -6560,6 +6588,11 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
     if (DiscardResult) {
       if (T == PT_Float)
         return this->emitDecfPop(getFPOptions(E), E);
+      if (SubExpr->refersToBitField())
+        return DiscardResult ? this->emitDecPopBitfield(*T, E->canOverflow(),
+                                                        getBitWidth(SubExpr), E)
+                             : this->emitDecBitfield(*T, E->canOverflow(),
+                                                     getBitWidth(SubExpr), E);
       return this->emitDecPop(*T, E->canOverflow(), E);
     }
 
@@ -6575,6 +6608,11 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
         return false;
       if (!this->emitStoreFloat(E))
         return false;
+    } else if (SubExpr->refersToBitField()) {
+      assert(isIntegralType(*T));
+      if (!this->emitPreDecBitfield(*T, E->canOverflow(), getBitWidth(SubExpr),
+                                    E))
+        return false;
     } else {
       assert(isIntegralType(*T));
       if (!this->emitPreDec(*T, E->canOverflow(), E))
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index d8529da..89f6fbe 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -702,7 +702,7 @@ enum class IncDecOp {
 
 template <typename T, IncDecOp Op, PushVal DoPush>
 bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
-                  bool CanOverflow) {
+                  bool CanOverflow, UnsignedOrNone BitWidth = std::nullopt) {
   assert(!Ptr.isDummy());
 
   if (!S.inConstantContext()) {
@@ -725,12 +725,18 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   if constexpr (Op == IncDecOp::Inc) {
     if (!T::increment(Value, &Result) || !CanOverflow) {
-      Ptr.deref<T>() = Result;
+      if (BitWidth)
+        Ptr.deref<T>() = Result.truncate(*BitWidth);
+      else
+        Ptr.deref<T>() = Result;
       return true;
     }
   } else {
     if (!T::decrement(Value, &Result) || !CanOverflow) {
-      Ptr.deref<T>() = Result;
+      if (BitWidth)
+        Ptr.deref<T>() = Result.truncate(*BitWidth);
+      else
+        Ptr.deref<T>() = Result;
       return true;
     }
   }
@@ -774,6 +780,17 @@ bool Inc(InterpState &S, CodePtr OpPC, bool CanOverflow) {
                                                       CanOverflow);
 }
 
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool IncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                 uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Increment))
+    return false;
+  // Forward BitWidth so the helper truncates the stored result to the field.
+  return IncDecHelper<T, IncDecOp::Inc, PushVal::Yes>(S, OpPC, Ptr, CanOverflow,
+                                                      BitWidth);
+}
+
 /// 1) Pops a pointer from the stack
 /// 2) Load the value from the pointer
 /// 3) Writes the value increased by one back to the pointer
@@ -787,6 +804,17 @@ bool IncPop(InterpState &S, CodePtr OpPC, bool CanOverflow) {
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool IncPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                    uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Increment))
+    return false;
+
+  return IncDecHelper<T, IncDecOp::Inc, PushVal::No>(S, OpPC, Ptr, CanOverflow,
+                                                     BitWidth);
+}
+
+template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Increment))
@@ -795,6 +823,17 @@ bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   return IncDecHelper<T, IncDecOp::Inc, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool PreIncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                    uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.peek<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Increment))
+    return false;
+
+  return IncDecHelper<T, IncDecOp::Inc, PushVal::No>(S, OpPC, Ptr, CanOverflow,
+                                                     BitWidth);
+}
+
 /// 1) Pops a pointer from the stack
 /// 2) Load the value from the pointer
 /// 3) Writes the value decreased by one back to the pointer
@@ -808,6 +847,16 @@ bool Dec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   return IncDecHelper<T, IncDecOp::Dec, PushVal::Yes>(S, OpPC, Ptr,
                                                       CanOverflow);
 }
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool DecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                 uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
+    return false;
+
+  return IncDecHelper<T, IncDecOp::Dec, PushVal::Yes>(S, OpPC, Ptr, CanOverflow,
+                                                      BitWidth);
+}
 
 /// 1) Pops a pointer from the stack
 /// 2) Load the value from the pointer
@@ -822,6 +871,17 @@ bool DecPop(InterpState &S, CodePtr OpPC, bool CanOverflow) {
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool DecPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                    uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
+    return false;
+
+  return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow,
+                                                     BitWidth);
+}
+
+template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
@@ -829,6 +889,16 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool PreDecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow,
+                    uint32_t BitWidth) {
+  const Pointer &Ptr = S.Stk.peek<Pointer>();
+  if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
+    return false;
+  return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow,
+                                                     BitWidth);
+}
+
 template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 532c444..406feb5 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -612,12 +612,25 @@ class OverflowOpcode : Opcode {
   let HasGroup = 1;
 }
 
+class OverflowBitfieldOpcode : Opcode {
+  let Types = [AluTypeClass];
+  let Args = [ArgBool, ArgUint32];
+  let HasGroup = 1;
+}
+
 def Inc : OverflowOpcode;
+def IncBitfield : OverflowBitfieldOpcode;
 def IncPop : OverflowOpcode;
+def IncPopBitfield : OverflowBitfieldOpcode;
 def PreInc : OverflowOpcode;
+def PreIncBitfield : OverflowBitfieldOpcode;
+
 def Dec : OverflowOpcode;
+def DecBitfield : OverflowBitfieldOpcode;
 def DecPop : OverflowOpcode;
+def DecPopBitfield : OverflowBitfieldOpcode;
 def PreDec : OverflowOpcode;
+def PreDecBitfield : OverflowBitfieldOpcode;
 
 // Float increment and decrement.
 def Incf: FloatOpcode;
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index 07e3de8..2b56c1e 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -12,7 +12,9 @@
 
 #include "clang/AST/StmtOpenACC.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtCXX.h"
+
 using namespace clang;
 
 OpenACCComputeConstruct *
@@ -322,6 +324,38 @@ OpenACCAtomicConstruct *OpenACCAtomicConstruct::Create(
   return Inst;
 }
 
+const OpenACCAtomicConstruct::StmtInfo
+OpenACCAtomicConstruct::getAssociatedStmtInfo() const {
+  // This ends up being a vastly simplified version of SemaOpenACCAtomic, since
+  // it doesn't have to worry about erroring out, but we should do a lot of
+  // asserts to ensure we don't get off into the weeds.
+  assert(getAssociatedStmt() && "invalid associated stmt?");
+
+  switch (AtomicKind) {
+  case OpenACCAtomicKind::None:
+  case OpenACCAtomicKind::Write:
+  case OpenACCAtomicKind::Update:
+  case OpenACCAtomicKind::Capture:
+    assert(false && "Only 'read' has been implemented here");
+    return {};
+  case OpenACCAtomicKind::Read: {
+    // Read only supports the format 'v = x'; where both sides are a scalar
+    // expression. This can come in 2 forms; BinaryOperator or
+    // CXXOperatorCallExpr (rarely).
+    const Expr *AssignExpr = cast<const Expr>(getAssociatedStmt());
+    if (const auto *BO = dyn_cast<BinaryOperator>(AssignExpr)) {
+      assert(BO->getOpcode() == BO_Assign);
+      return {BO->getLHS()->IgnoreImpCasts(), BO->getRHS()->IgnoreImpCasts()};
+    }
+
+    const auto *OO = cast<CXXOperatorCallExpr>(AssignExpr);
+    assert(OO->getOperator() == OO_Equal);
+
+    return {OO->getArg(0)->IgnoreImpCasts(), OO->getArg(1)->IgnoreImpCasts()};
+  }
+  }
+}
+
 OpenACCCacheConstruct *OpenACCCacheConstruct::CreateEmpty(const ASTContext &C,
                                                           unsigned NumVars) {
   void *Mem =
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 2da7789..c18b2ea 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -2147,9 +2147,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
   }
   case attr::AArch64VectorPcs: OS << "aarch64_vector_pcs"; break;
   case attr::AArch64SVEPcs: OS << "aarch64_sve_pcs"; break;
-  case attr::DeviceKernel:
-    OS << T->getAttr()->getSpelling();
-    break;
   case attr::IntelOclBicc:
     OS << "inteloclbicc";
     break;
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 33c2958..f5c8396 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -200,7 +200,7 @@ public:
     // a host function.
     if (HostTarget)
       return HostTarget->checkCallingConvention(CC);
-    return CCCR_Warning;
+    return CC == CC_DeviceKernel ? CCCR_OK : CCCR_Warning;
   }
 
   bool hasBitIntType() const override { return true; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 65e6a39..800262a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -1011,9 +1011,9 @@ public:
   }
 
   mlir::Attribute VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die, QualType t) {
-    cgm.errorNYI(die->getBeginLoc(),
-                 "ConstExprEmitter::VisitCXXDefaultInitExpr");
-    return {};
+    // No need for a DefaultInitExprScope: we don't handle 'this' in a
+    // constant expression.
+    return Visit(die->getExpr(), t);
   }
 
   mlir::Attribute VisitExprWithCleanups(ExprWithCleanups *e, QualType t) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index ce4ae7e..385f89c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -553,12 +553,15 @@ public:
   }
 
   void VisitIfClause(const OpenACCIfClause &clause) {
-    if constexpr (isOneOfTypes<OpTy, mlir::acc::ParallelOp, mlir::acc::SerialOp,
-                               mlir::acc::KernelsOp, mlir::acc::InitOp,
-                               mlir::acc::ShutdownOp, mlir::acc::SetOp,
-                               mlir::acc::DataOp, mlir::acc::WaitOp,
-                               mlir::acc::HostDataOp, mlir::acc::EnterDataOp,
-                               mlir::acc::ExitDataOp, mlir::acc::UpdateOp>) {
+    if constexpr (isOneOfTypes<
+                      OpTy, mlir::acc::ParallelOp, mlir::acc::SerialOp,
+                      mlir::acc::KernelsOp, mlir::acc::InitOp,
+                      mlir::acc::ShutdownOp, mlir::acc::SetOp,
+                      mlir::acc::DataOp, mlir::acc::WaitOp,
+                      mlir::acc::HostDataOp, mlir::acc::EnterDataOp,
+                      mlir::acc::ExitDataOp, mlir::acc::UpdateOp,
+                      mlir::acc::AtomicReadOp, mlir::acc::AtomicWriteOp,
+                      mlir::acc::AtomicUpdateOp, mlir::acc::AtomicCaptureOp>) {
       operation.getIfCondMutable().append(
           createCondition(clause.getConditionExpr()));
     } else if constexpr (isCombinedType<OpTy>) {
@@ -1144,6 +1147,10 @@ EXPL_SPEC(mlir::acc::HostDataOp)
 EXPL_SPEC(mlir::acc::EnterDataOp)
 EXPL_SPEC(mlir::acc::ExitDataOp)
 EXPL_SPEC(mlir::acc::UpdateOp)
+EXPL_SPEC(mlir::acc::AtomicReadOp)
+EXPL_SPEC(mlir::acc::AtomicWriteOp)
+EXPL_SPEC(mlir::acc::AtomicCaptureOp)
+EXPL_SPEC(mlir::acc::AtomicUpdateOp)
 #undef EXPL_SPEC
 
 template <typename ComputeOp, typename LoopOp>
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index e89393c..02bb46d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -306,6 +306,29 @@ CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) {
 
 mlir::LogicalResult
 CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) {
-  cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct");
-  return mlir::failure();
+  // For now, we only support 'read', so diagnose. We can switch on the kind
+  // later once we start implementing the other 3 forms.
+  if (s.getAtomicKind() != OpenACCAtomicKind::Read) {
+    cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct");
+    return mlir::failure();
+  }
+
+  // While Atomic is an 'associated statement' construct, it 'steals' the
+  // expression it is associated with rather than emitting it inside of it.  So
+  // it has custom emit logic.
+  mlir::Location start = getLoc(s.getSourceRange().getBegin());
+  OpenACCAtomicConstruct::StmtInfo inf = s.getAssociatedStmtInfo();
+  // Atomic 'read' only permits 'v = x', where v and x are both scalar L values.
+  // The getAssociatedStmtInfo strips off implicit casts, which includes
+  // implicit conversions and L-to-R-Value conversions, so we can just emit it
+  // as an L value.  The Flang implementation has no problem with different
+  // types, so it appears that the dialect can handle the conversions.
+  mlir::Value v = emitLValue(inf.V).getPointer();
+  mlir::Value x = emitLValue(inf.X).getPointer();
+  mlir::Type resTy = convertType(inf.V->getType());
+  auto op = mlir::acc::AtomicReadOp::create(builder, start, x, v, resTy,
+                                            /*ifCond=*/{});
+  emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
+                     s.clauses());
+  return mlir::success();
 }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 6020684..c423c4b 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -234,9 +234,12 @@ public:
 };
 } // namespace
 
-static AllocTokenOptions getAllocTokenOptions(const CodeGenOptions &CGOpts) {
+static AllocTokenOptions getAllocTokenOptions(const LangOptions &LangOpts,
+                                              const CodeGenOptions &CGOpts) {
   AllocTokenOptions Opts;
-  Opts.MaxTokens = CGOpts.AllocTokenMax;
+  if (LangOpts.AllocTokenMode)
+    Opts.Mode = *LangOpts.AllocTokenMode;
+  Opts.MaxTokens = LangOpts.AllocTokenMax;
   Opts.Extended = CGOpts.SanitizeAllocTokenExtended;
   Opts.FastABI = CGOpts.SanitizeAllocTokenFastABI;
   return Opts;
@@ -430,12 +433,6 @@ static bool initTargetOptions(const CompilerInstance &CI,
   Options.NoInfsFPMath = LangOpts.NoHonorInfs;
   Options.NoNaNsFPMath = LangOpts.NoHonorNaNs;
   Options.NoZerosInBSS = CodeGenOpts.NoZeroInitializedInBSS;
-  Options.UnsafeFPMath = LangOpts.AllowFPReassoc && LangOpts.AllowRecip &&
-                         LangOpts.NoSignedZero && LangOpts.ApproxFunc &&
-                         (LangOpts.getDefaultFPContractMode() ==
-                              LangOptions::FPModeKind::FPM_Fast ||
-                          LangOpts.getDefaultFPContractMode() ==
-                              LangOptions::FPModeKind::FPM_FastHonorPragmas);
 
   Options.BBAddrMap = CodeGenOpts.BBAddrMap;
   Options.BBSections =
@@ -808,7 +805,7 @@ static void addSanitizers(const Triple &TargetTriple,
         // memory allocation function detection.
         MPM.addPass(InferFunctionAttrsPass());
       }
-      MPM.addPass(AllocTokenPass(getAllocTokenOptions(CodeGenOpts)));
+      MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts)));
     }
   };
   if (ClSanitizeOnOptimizerEarlyEP) {
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e490b1c..3746bc04 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1325,22 +1325,29 @@ void CodeGenModule::Release() {
                               "tag-stack-memory-buildattr", 1);
 
   if (T.isARM() || T.isThumb() || T.isAArch64()) {
+    // Previously 1 was used and meant for the backend to derive the function
+    // attribute from it. 2 now means function attributes already set for all
+    // functions in this module, so no need to propagate those from the module
+    // flag. Value is only used in case of LTO module merge because the backend
+    // will see all required function attributes set already. Value is used
+    // before modules got merged. Any positive value means the feature is active
+    // and required binary markings need to be emitted accordingly.
     if (LangOpts.BranchTargetEnforcement)
       getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement",
-                                1);
+                                2);
     if (LangOpts.BranchProtectionPAuthLR)
       getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr",
-                                1);
+                                2);
     if (LangOpts.GuardedControlStack)
-      getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 1);
+      getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 2);
     if (LangOpts.hasSignReturnAddress())
-      getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1);
+      getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 2);
     if (LangOpts.isSignReturnAddressScopeAll())
       getModule().addModuleFlag(llvm::Module::Min, "sign-return-address-all",
-                                1);
+                                2);
     if (!LangOpts.isSignReturnAddressWithAKey())
       getModule().addModuleFlag(llvm::Module::Min,
-                                "sign-return-address-with-bkey", 1);
+                                "sign-return-address-with-bkey", 2);
 
     if (LangOpts.PointerAuthELFGOT)
       getModule().addModuleFlag(llvm::Module::Min, "ptrauth-elf-got", 1);
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0bc4b4b7..e4ad078 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -439,9 +439,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     return;
 
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (FD)
+  if (FD) {
     setFunctionDeclAttributes(FD, F, M);
-
+    if (FD->hasAttr<DeviceKernelAttr>() && !M.getLangOpts().OpenCL)
+      F->setCallingConv(getDeviceKernelCallingConv());
+  }
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
 }
@@ -658,7 +660,7 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
   // kernel address (only the kernel descriptor).
   auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                    &Mod);
-  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+  F->setCallingConv(getDeviceKernelCallingConv());
 
   llvm::AttrBuilder KernelAttrs(C);
   // FIXME: The invoke isn't applying the right attributes either
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 53f2fc4..f6715861 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -264,7 +264,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
       // And kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
       if (FD->hasAttr<CUDAGlobalAttr>()) {
-        F->setCallingConv(llvm::CallingConv::PTX_Kernel);
+        F->setCallingConv(getDeviceKernelCallingConv());
 
         for (auto IV : llvm::enumerate(FD->parameters()))
           if (IV.value()->hasAttr<CUDAGridConstantAttr>())
@@ -278,7 +278,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
   }
   // Attach kernel metadata directly if compiling for NVPTX.
   if (FD->hasAttr<DeviceKernelAttr>())
-    F->setCallingConv(llvm::CallingConv::PTX_Kernel);
+    F->setCallingConv(getDeviceKernelCallingConv());
 }
 
 void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 80e096e..15d0b35 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -64,6 +64,8 @@ public:
   llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                  llvm::PointerType *T,
                                  QualType QT) const override;
+  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
 };
 class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
 public:
@@ -268,6 +270,22 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
       llvm::ConstantPointerNull::get(NPT), PT);
 }
 
+void CommonSPIRTargetCodeGenInfo::setTargetAttributes(
+    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+  if (M.getLangOpts().OpenCL || GV->isDeclaration())
+    return;
+
+  const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
+  if (!FD)
+    return;
+
+  llvm::Function *F = dyn_cast<llvm::Function>(GV);
+  assert(F && "Expected GlobalValue to be a Function");
+
+  if (FD->hasAttr<DeviceKernelAttr>())
+    F->setCallingConv(getDeviceKernelCallingConv());
+}
+
 LangAS
 SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
@@ -292,19 +310,23 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
 
 void SPIRVTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
-  if (!M.getLangOpts().HIP ||
-      M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
-    return;
   if (GV->isDeclaration())
     return;
 
-  auto F = dyn_cast<llvm::Function>(GV);
-  if (!F)
+  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
+  if (!FD)
     return;
 
-  auto FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (!FD)
+  llvm::Function *F = dyn_cast<llvm::Function>(GV);
+  assert(F && "Expected GlobalValue to be a Function");
+
+  if (FD->hasAttr<DeviceKernelAttr>())
+    F->setCallingConv(getDeviceKernelCallingConv());
+
+  if (!M.getLangOpts().HIP ||
+      M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
     return;
+
   if (!FD->hasAttr<CUDAGlobalAttr>())
     return;
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a7310ba..caf7478 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9099,6 +9099,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   };
   auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
                            const ToolChain &TC) {
+    // CMake hack to avoid printing verbose information for HIP non-RDC mode.
+    if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
+      return false;
     return (Set.contains(A->getOption().getID()) ||
             (A->getOption().getGroup().isValid() &&
              Set.contains(A->getOption().getGroup().getID()))) &&
@@ -9174,7 +9177,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
 
   CmdArgs.push_back(
       Args.MakeArgString("--host-triple=" + getToolChain().getTripleString()));
-  if (Args.hasArg(options::OPT_v))
+
+  // CMake hack, suppress passing verbose arguments for the special-case HIP
+  // non-RDC mode compilation. This confuses default CMake implicit linker
+  // argument parsing when the language is set to HIP and the system linker is
+  // also `ld.lld`.
+  if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
     CmdArgs.push_back("--wrapper-verbose");
   if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
     CmdArgs.push_back(
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index c97a9e8..25971d2 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3791,12 +3791,18 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts,
   if (Current.is(TT_FunctionDeclarationName))
     return true;
 
-  if (Current.isNoneOf(tok::identifier, tok::kw_operator))
+  if (!Current.Tok.getIdentifierInfo())
     return false;
 
   const auto *Prev = Current.getPreviousNonComment();
   assert(Prev);
 
+  if (Prev->is(tok::coloncolon))
+    Prev = Prev->Previous;
+
+  if (!Prev)
+    return false;
+
   const auto &Previous = *Prev;
 
   if (const auto *PrevPrev = Previous.getPreviousNonComment();
@@ -3845,8 +3851,6 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts,
 
   // Find parentheses of parameter list.
   if (Current.is(tok::kw_operator)) {
-    if (Line.startsWith(tok::kw_friend))
-      return true;
     if (Previous.Tok.getIdentifierInfo() &&
         Previous.isNoneOf(tok::kw_return, tok::kw_co_return)) {
       return true;
diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
index cb44568..d53b64a 100644
--- a/clang/lib/Frontend/ASTUnit.cpp
+++ b/clang/lib/Frontend/ASTUnit.cpp
@@ -1651,6 +1651,7 @@ ASTUnit *ASTUnit::LoadFromCompilerInvocationAction(
   AST->Reader = nullptr;
 
   // Create a file manager object to provide access to and cache the filesystem.
+  Clang->setVirtualFileSystem(AST->getVirtualFileSystemPtr());
   Clang->setFileManager(AST->getFileManagerPtr());
 
   // Create the source manager.
@@ -2290,6 +2291,7 @@ void ASTUnit::CodeComplete(
          "IR inputs not support here!");
 
   // Use the source and file managers that we were given.
+  Clang->setVirtualFileSystem(FileMgr->getVirtualFileSystemPtr());
   Clang->setFileManager(FileMgr);
   Clang->setSourceManager(SourceMgr);
 
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 5844366..374138f 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -160,8 +160,6 @@ bool CompilerInstance::createTarget() {
 }
 
 void CompilerInstance::setFileManager(IntrusiveRefCntPtr<FileManager> Value) {
-  if (!hasVirtualFileSystem())
-    setVirtualFileSystem(Value->getVirtualFileSystemPtr());
   assert(Value == nullptr ||
          getVirtualFileSystemPtr() == Value->getVirtualFileSystemPtr());
   FileMgr = std::move(Value);
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 5bd15f5..d2cb751 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1833,10 +1833,6 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
        serializeSanitizerKinds(Opts.SanitizeAnnotateDebugInfo))
     GenerateArg(Consumer, OPT_fsanitize_annotate_debug_info_EQ, Sanitizer);
 
-  if (Opts.AllocTokenMax)
-    GenerateArg(Consumer, OPT_falloc_token_max_EQ,
-                std::to_string(*Opts.AllocTokenMax));
-
   if (!Opts.EmitVersionIdentMetadata)
     GenerateArg(Consumer, OPT_Qn);
 
@@ -2350,15 +2346,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
     }
   }
 
-  if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) {
-    StringRef S = Arg->getValue();
-    uint64_t Value = 0;
-    if (S.getAsInteger(0, Value))
-      Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S;
-    else
-      Opts.AllocTokenMax = Value;
-  }
-
   Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true);
 
   if (!LangOpts->CUDAIsDevice)
@@ -3966,6 +3953,29 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
 
   if (!Opts.RandstructSeed.empty())
     GenerateArg(Consumer, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed);
+
+  if (Opts.AllocTokenMax)
+    GenerateArg(Consumer, OPT_falloc_token_max_EQ,
+                std::to_string(*Opts.AllocTokenMax));
+
+  if (Opts.AllocTokenMode) {
+    StringRef S;
+    switch (*Opts.AllocTokenMode) {
+    case llvm::AllocTokenMode::Increment:
+      S = "increment";
+      break;
+    case llvm::AllocTokenMode::Random:
+      S = "random";
+      break;
+    case llvm::AllocTokenMode::TypeHash:
+      S = "typehash";
+      break;
+    case llvm::AllocTokenMode::TypeHashPointerSplit:
+      S = "typehashpointersplit";
+      break;
+    }
+    GenerateArg(Consumer, OPT_falloc_token_mode_EQ, S);
+  }
 }
 
 bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
@@ -4544,6 +4554,23 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   if (const Arg *A = Args.getLastArg(OPT_frandomize_layout_seed_EQ))
     Opts.RandstructSeed = A->getValue(0);
 
+  if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) {
+    StringRef S = Arg->getValue();
+    uint64_t Value = 0;
+    if (S.getAsInteger(0, Value))
+      Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S;
+    else
+      Opts.AllocTokenMax = Value;
+  }
+
+  if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_mode_EQ)) {
+    StringRef S = Arg->getValue();
+    if (auto Mode = getAllocTokenModeFromString(S))
+      Opts.AllocTokenMode = Mode;
+    else
+      Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S;
+  }
+
   // Validate options for HLSL
   if (Opts.HLSL) {
     // TODO: Revisit restricting SPIR-V to logical once we've figured out how to
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 0daa20a..ed1169e 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -945,6 +945,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI,
 
     // Set the shared objects, these are reset when we finish processing the
     // file, otherwise the CompilerInstance will happily destroy them.
+    CI.setVirtualFileSystem(AST->getVirtualFileSystemPtr());
     CI.setFileManager(AST->getFileManagerPtr());
     CI.setSourceManager(AST->getSourceManagerPtr());
     CI.setPreprocessor(AST->getPreprocessorPtr());
diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp
index 03f70b7..9bf18b4 100644
--- a/clang/lib/Frontend/PrecompiledPreamble.cpp
+++ b/clang/lib/Frontend/PrecompiledPreamble.cpp
@@ -479,16 +479,12 @@ llvm::ErrorOr<PrecompiledPreamble> PrecompiledPreamble::Build(
   Diagnostics->Reset();
   ProcessWarningOptions(*Diagnostics, Clang->getDiagnosticOpts(), *VFS);
 
-  VFS = createVFSFromCompilerInvocation(Clang->getInvocation(), *Diagnostics,
-                                        VFS);
-
   // Create a file manager object to provide access to and cache the filesystem.
-  Clang->setFileManager(
-      llvm::makeIntrusiveRefCnt<FileManager>(Clang->getFileSystemOpts(), VFS));
+  Clang->createVirtualFileSystem(VFS);
+  Clang->createFileManager();
 
   // Create the source manager.
-  Clang->setSourceManager(llvm::makeIntrusiveRefCnt<SourceManager>(
-      *Diagnostics, Clang->getFileManager()));
+  Clang->createSourceManager();
 
   auto PreambleDepCollector = std::make_shared<PreambleDependencyCollector>();
   Clang->addDependencyCollector(PreambleDepCollector);
diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
index c4449c7..b377c17 100644
--- a/clang/lib/Headers/avx512ifmavlintrin.h
+++ b/clang/lib/Headers/avx512ifmavlintrin.h
@@ -37,6 +37,7 @@
 
 #endif
 
+#if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__))
 #define _mm_madd52hi_epu64(X, Y, Z)                                            \
   ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
                                           (__v2di)(Z)))
@@ -52,56 +53,83 @@
 #define _mm256_madd52lo_epu64(X, Y, Z)                                         \
   ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y),            \
                                           (__v4di)(Z)))
+#endif
+
+#if defined(__AVX512IFMA__)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
+                                                (__v2di)__Z);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+                                                (__v2di)__Z);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
+}
+#endif
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_selectq_128(
-      __M, (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W);
+      __M, (__v2di)__builtin_ia32_vpmadd52huq128(__W, __X, __Y), (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
   return (__m128i)__builtin_ia32_selectq_128(
-      __M, (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
+      __M, (__v2di)__builtin_ia32_vpmadd52huq128(__X, __Y, __Z),
       (__v2di)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64(
     __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_selectq_256(
-      __M, (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W);
+      __M, (__v4di)__builtin_ia32_vpmadd52huq256(__W, __X, __Y), (__v4di)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64(
     __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
   return (__m256i)__builtin_ia32_selectq_256(
-      __M, (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
+      __M, (__v4di)__builtin_ia32_vpmadd52huq256(__X, __Y, __Z),
       (__v4di)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_selectq_128(
-      __M, (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W);
+      __M, (__v2di)__builtin_ia32_vpmadd52luq128(__W, __X, __Y), (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
   return (__m128i)__builtin_ia32_selectq_128(
-      __M, (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
+      __M, (__v2di)__builtin_ia32_vpmadd52luq128(__X, __Y, __Z),
       (__v2di)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64(
     __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_selectq_256(
-      __M, (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W);
+      __M, (__v4di)__builtin_ia32_vpmadd52luq256(__W, __X, __Y), (__v4di)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64(
     __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
   return (__m256i)__builtin_ia32_selectq_256(
-      __M, (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
+      __M, (__v4di)__builtin_ia32_vpmadd52luq256(__X, __Y, __Z),
       (__v4di)_mm256_setzero_si256());
 }
 
diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
index a2ef601..e452d5f 100644
--- a/clang/lib/Headers/avxifmaintrin.h
+++ b/clang/lib/Headers/avxifmaintrin.h
@@ -31,6 +31,13 @@
                  __min_vector_width__(256)))
 #endif
 
+#if !defined(__AVX512IFMA__) && defined(__AVXIFMA__)
+#define _mm_madd52hi_epu64(X, Y, Z) _mm_madd52hi_avx_epu64(X, Y, Z)
+#define _mm_madd52lo_epu64(X, Y, Z) _mm_madd52lo_avx_epu64(X, Y, Z)
+#define _mm256_madd52hi_epu64(X, Y, Z) _mm256_madd52hi_avx_epu64(X, Y, Z)
+#define _mm256_madd52lo_epu64(X, Y, Z) _mm256_madd52lo_avx_epu64(X, Y, Z)
+#endif
+
 // must vex-encoding
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp
index cbdefaa..056e133 100644
--- a/clang/lib/Sema/HeuristicResolver.cpp
+++ b/clang/lib/Sema/HeuristicResolver.cpp
@@ -450,7 +450,12 @@ QualType HeuristicResolverImpl::resolveExprToType(const Expr *E) {
   if (const auto *CE = dyn_cast<CallExpr>(E)) {
     if (QualType Resolved = resolveTypeOfCallExpr(CE); !Resolved.isNull())
       return Resolved;
+
+    // Don't proceed to try resolveExprToDecls(), it would just call
+    // resolveTypeOfCallExpr() again.
+    return E->getType();
   }
+
   // Similarly, unwrapping a unary dereference operation does not work via
   // resolveExprToDecls.
   if (const auto *UO = dyn_cast<UnaryOperator>(E->IgnoreParenCasts())) {
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 215ac18..8ed3df7 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1484,6 +1484,13 @@ void Sema::ActOnEndOfTranslationUnit() {
       Consumer.CompleteTentativeDefinition(VD);
   }
 
+  // In incremental mode, tentative definitions belong to the current
+  // partial translation unit (PTU). Once they have been completed and
+  // emitted to codegen, drop them to prevent re-emission in future PTUs.
+  if (PP.isIncrementalProcessingEnabled())
+    TentativeDefinitions.erase(TentativeDefinitions.begin(ExternalSource.get()),
+                               TentativeDefinitions.end());
+
   for (auto *D : ExternalDeclarations) {
     if (!D || D->isInvalidDecl() || D->getPreviousDecl() || !D->isUsed())
       continue;
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 54cbfe4..a1163e9 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -417,8 +417,8 @@ class ConstraintSatisfactionChecker {
   const NamedDecl *Template;
   SourceLocation TemplateNameLoc;
   UnsignedOrNone PackSubstitutionIndex;
-
   ConstraintSatisfaction &Satisfaction;
+  bool BuildExpression;
 
 private:
   ExprResult
@@ -461,10 +461,11 @@ public:
   ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template,
                                 SourceLocation TemplateNameLoc,
                                 UnsignedOrNone PackSubstitutionIndex,
-                                ConstraintSatisfaction &Satisfaction)
+                                ConstraintSatisfaction &Satisfaction,
+                                bool BuildExpression)
       : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc),
         PackSubstitutionIndex(PackSubstitutionIndex),
-        Satisfaction(Satisfaction) {}
+        Satisfaction(Satisfaction), BuildExpression(BuildExpression) {}
 
   ExprResult Evaluate(const NormalizedConstraint &Constraint,
                       const MultiLevelTemplateArgumentList &MLTAL);
@@ -821,9 +822,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
     Satisfaction.ContainsErrors = false;
     ExprResult Expr =
         ConstraintSatisfactionChecker(S, Template, TemplateNameLoc,
-                                      UnsignedOrNone(I), Satisfaction)
+                                      UnsignedOrNone(I), Satisfaction,
+                                      /*BuildExpression=*/false)
             .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs);
-    if (Expr.isUsable()) {
+    if (BuildExpression && Expr.isUsable()) {
       if (Out.isUnset())
         Out = Expr;
       else
@@ -834,7 +836,7 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
                                      Constraint.getBeginLoc(),
                                      FPOptionsOverride{});
     } else {
-      assert(!Satisfaction.IsSatisfied);
+      assert(!BuildExpression || !Satisfaction.IsSatisfied);
     }
     if (!Conjunction && Satisfaction.IsSatisfied) {
       Satisfaction.Details.erase(Satisfaction.Details.begin() +
@@ -985,7 +987,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate(
 
   ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL);
 
-  if (!E.isUsable()) {
+  if (E.isInvalid()) {
     Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId);
     return E;
   }
@@ -1041,7 +1043,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate(
   if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors))
     return LHS;
 
-  if (!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied &&
+  if (!Conjunction && !LHS.isInvalid() && Satisfaction.IsSatisfied &&
       !Satisfaction.ContainsErrors)
     return LHS;
 
@@ -1050,12 +1052,15 @@ ExprResult ConstraintSatisfactionChecker::Evaluate(
 
   ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL);
 
-  if (RHS.isUsable() && Satisfaction.IsSatisfied &&
+  if (!Conjunction && !RHS.isInvalid() && Satisfaction.IsSatisfied &&
       !Satisfaction.ContainsErrors)
     Satisfaction.Details.erase(Satisfaction.Details.begin() +
                                    EffectiveDetailEndIndex,
                                Satisfaction.Details.end());
 
+  if (!BuildExpression)
+    return Satisfaction.ContainsErrors ? ExprError() : ExprEmpty();
+
   if (!LHS.isUsable())
     return RHS;
 
@@ -1136,10 +1141,11 @@ static bool CheckConstraintSatisfaction(
                                     Template, /*CSE=*/nullptr,
                                     S.ArgPackSubstIndex);
 
-  ExprResult Res =
-      ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(),
-                                    S.ArgPackSubstIndex, Satisfaction)
-          .Evaluate(*C, TemplateArgsLists);
+  ExprResult Res = ConstraintSatisfactionChecker(
+                       S, Template, TemplateIDRange.getBegin(),
+                       S.ArgPackSubstIndex, Satisfaction,
+                       /*BuildExpression=*/ConvertedExpr != nullptr)
+                       .Evaluate(*C, TemplateArgsLists);
 
   if (Res.isInvalid())
     return true;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 9475b8a..964a2a7 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5206,16 +5206,36 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 static void handleDeviceKernelAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   const auto *FD = dyn_cast_or_null<FunctionDecl>(D);
   bool IsFunctionTemplate = FD && FD->getDescribedFunctionTemplate();
-  if (S.getASTContext().getTargetInfo().getTriple().isNVPTX()) {
+  llvm::Triple Triple = S.getASTContext().getTargetInfo().getTriple();
+  const LangOptions &LangOpts = S.getLangOpts();
+  // OpenCL has its own error messages.
+  if (!LangOpts.OpenCL && FD && !FD->isExternallyVisible()) {
+    S.Diag(AL.getLoc(), diag::err_hidden_device_kernel) << FD;
+    AL.setInvalid();
+    return;
+  }
+  if (Triple.isNVPTX()) {
     handleGlobalAttr(S, D, AL);
   } else {
     // OpenCL C++ will throw a more specific error.
-    if (!S.getLangOpts().OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) {
+    if (!LangOpts.OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) {
       S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str)
           << AL << AL.isRegularKeywordAttribute() << "functions";
+      AL.setInvalid();
+      return;
     }
     handleSimpleAttribute<DeviceKernelAttr>(S, D, AL);
   }
+  // TODO: isGPU() should probably return true for SPIR.
+  bool TargetDeviceEnvironment = Triple.isGPU() || Triple.isSPIR() ||
+                                 LangOpts.isTargetDevice() || LangOpts.OpenCL;
+  if (!TargetDeviceEnvironment) {
+    S.Diag(AL.getLoc(), diag::warn_cconv_unsupported)
+        << AL << (int)Sema::CallingConventionIgnoredReason::ForThisTarget;
+    AL.setInvalid();
+    return;
+  }
+
   // Make sure we validate the CC with the target
   // and warn/error if necessary.
   handleCallConvAttr(S, D, AL);
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 3bb8080..ee9b2b3 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2999,11 +2999,11 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(
     BinOp = BinaryOperatorKind::BO_LT;
     break;
   case OpenACCReductionOperator::And:
+    BinOp = BinaryOperatorKind::BO_LAnd;
+    break;
   case OpenACCReductionOperator::Or:
-    // We just want a 'NYI' error in the backend, so leave an empty combiner
-    // recipe, and claim success.
-    CombinerRecipes.push_back({nullptr, nullptr, nullptr});
-    return false;
+    BinOp = BinaryOperatorKind::BO_LOr;
+    break;
   }
 
   // If VarTy is an array type, at the top level only, we want to do our
@@ -3068,8 +3068,21 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(
                               : CombinerFailureKind::Assignment};
     }
     case OpenACCReductionOperator::And:
-    case OpenACCReductionOperator::Or:
-      llvm_unreachable("And/Or not implemented, but should fail earlier");
+    case OpenACCReductionOperator::Or: {
+      // These are done as LHS = LHS && RHS (or LHS = LHS || RHS). So after the
+      // binop, all we have to do is the assignment.
+      if (!BinOpRes.isUsable())
+        return {BinOpRes, CombinerFailureKind::BinOp};
+
+      // Build assignment.
+      ExprResult Assignment = SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc,
+                                                 BinaryOperatorKind::BO_Assign,
+                                                 LHSDRE, BinOpRes.get(),
+                                                 /*ForFoldExpr=*/false);
+      return {Assignment, Assignment.isUsable()
+                              ? CombinerFailureKind::None
+                              : CombinerFailureKind::Assignment};
+    }
     case OpenACCReductionOperator::Invalid:
       llvm_unreachable("Invalid should have been caught above");
     }
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 7c1fb12..280b3c9 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -134,7 +134,6 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
   case ParsedAttr::AT_VectorCall:                                              \
   case ParsedAttr::AT_AArch64VectorPcs:                                        \
   case ParsedAttr::AT_AArch64SVEPcs:                                           \
-  case ParsedAttr::AT_DeviceKernel:                                            \
   case ParsedAttr::AT_MSABI:                                                   \
   case ParsedAttr::AT_SysVABI:                                                 \
   case ParsedAttr::AT_Pcs:                                                     \
@@ -3786,7 +3785,8 @@ static CallingConv getCCForDeclaratorChunk(
       }
     }
   }
-  for (const ParsedAttr &AL : D.getDeclSpec().getAttributes()) {
+  for (const ParsedAttr &AL : llvm::concat<ParsedAttr>(
+           D.getDeclSpec().getAttributes(), D.getAttributes())) {
     if (AL.getKind() == ParsedAttr::AT_DeviceKernel) {
       CC = CC_DeviceKernel;
       break;
@@ -7569,8 +7569,6 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) {
     return createSimpleAttr<AArch64SVEPcsAttr>(Ctx, Attr);
   case ParsedAttr::AT_ArmStreaming:
     return createSimpleAttr<ArmStreamingAttr>(Ctx, Attr);
-  case ParsedAttr::AT_DeviceKernel:
-    return createSimpleAttr<DeviceKernelAttr>(Ctx, Attr);
   case ParsedAttr::AT_Pcs: {
     // The attribute may have had a fixit applied where we treated an
     // identifier as a string literal.  The contents of the string are valid,
@@ -8809,16 +8807,6 @@ static void HandleHLSLParamModifierAttr(TypeProcessingState &State,
   }
 }
 
-static bool isMultiSubjectAttrAllowedOnType(const ParsedAttr &Attr) {
-  // The DeviceKernel attribute is shared for many targets, and
-  // it is only allowed to be a type attribute with the AMDGPU
-  // spelling, so skip processing the attr as a type attr
-  // unless it has that spelling.
-  if (Attr.getKind() != ParsedAttr::AT_DeviceKernel)
-    return true;
-  return DeviceKernelAttr::isAMDGPUSpelling(Attr);
-}
-
 static void processTypeAttrs(TypeProcessingState &state, QualType &type,
                              TypeAttrLocation TAL,
                              const ParsedAttributesView &attrs,
@@ -9072,8 +9060,6 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
         break;
       [[fallthrough]];
     FUNCTION_TYPE_ATTRS_CASELIST:
-      if (!isMultiSubjectAttrAllowedOnType(attr))
-        break;
 
       attr.setUsedAsTypeAttr();
 
diff --git a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
index 5301f88..531c642b 100644
--- a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
+++ b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp
@@ -93,6 +93,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) {
 
   // The instance wants to take ownership, however DisableFree frontend option
   // is set to true to avoid double free issues
+  Instance.setVirtualFileSystem(CI.getVirtualFileSystemPtr());
   Instance.setFileManager(CI.getFileManagerPtr());
   Instance.setSourceManager(SM);
   Instance.setPreprocessor(CI.getPreprocessorPtr());
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index ea5a372..e8eef5e 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -446,6 +446,7 @@ bool FrontendActionFactory::runInvocation(
     DiagnosticConsumer *DiagConsumer) {
   // Create a compiler instance to handle the actual work.
   CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps));
+  Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr());
   Compiler.setFileManager(Files);
 
   // The FrontendAction can have lifetime requirements for Compiler or its
diff --git a/clang/test/AST/ByteCode/bitfields.cpp b/clang/test/AST/ByteCode/bitfields.cpp
index df8d567..a583282 100644
--- a/clang/test/AST/ByteCode/bitfields.cpp
+++ b/clang/test/AST/ByteCode/bitfields.cpp
@@ -128,3 +128,82 @@ namespace NonConstBitWidth {
                 // both-note {{read of non-const variable}}
   };
 }
+
+namespace IncDecOverflow { // ++/-- on a 5-bit unsigned bit-field wraps mod 32.
+  constexpr bool test1() {
+    struct {unsigned u: 5; } a {};
+    a.u--; // Post-decrement, result discarded: 0 wraps to 31.
+    return a.u == 31;
+  }
+  static_assert(test1(), "");
+
+  constexpr bool test2() {
+    struct {unsigned u: 5; } a {};
+    --a.u; // Pre-decrement, result discarded: 0 wraps to 31.
+    return a.u == 31;
+  }
+  static_assert(test2(), "");
+
+  constexpr bool test3() {
+    int x = 0;
+    struct {unsigned u: 5; } a {};
+    x = a.u--; // Post-decrement with the result consumed.
+    return a.u == 31;
+  }
+  static_assert(test3(), "");
+
+  constexpr bool test4() {
+    int x = 0;
+    struct {unsigned u: 5; } a {};
+    x = --a.u; // Pre-decrement with the result consumed.
+    return a.u == 31;
+  }
+  static_assert(test4(), "");
+
+  constexpr bool test5() {
+    struct {unsigned u: 5; } a {};
+    a.u = 31;
+    ++a.u; // Pre-increment, result discarded: 31 wraps to 0.
+
+    return a.u == 0;
+  }
+  static_assert(test5(), "");
+
+  constexpr bool test6() {
+    struct {unsigned u: 5; } a {};
+    a.u = 30;
+    ++a.u; // Control: 30 -> 31 still fits in 5 bits and must not wrap.
+
+    return a.u == 31;
+  }
+  static_assert(test6(), "");
+
+  constexpr bool test7() {
+    struct {unsigned u: 5; } a {};
+    a.u = 31;
+    a.u++; // Post-increment, result discarded: 31 wraps to 0.
+
+    return a.u == 0;
+  }
+  static_assert(test7(), "");
+
+  constexpr bool test8() {
+    int x = 0;
+    struct {unsigned u: 5; } a {};
+    a.u = 31;
+    x = a.u++; // Post-increment with the result consumed.
+
+    return a.u == 0;
+  }
+  static_assert(test8(), "");
+
+  constexpr bool test9() {
+    int x = 0;
+    struct {unsigned u: 5; } a {};
+    a.u = 31;
+    x = ++a.u; // Pre-increment with the result consumed.
+
+    return a.u == 0;
+  }
+  static_assert(test9(), "");
+}
diff --git a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
index 92eae6a..19362cf 100644
--- a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
+++ b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir 1> %t.cirlayout
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir > %t.cirlayout
 // RUN: FileCheck --input-file=%t.cirlayout %s --check-prefix=CIR-LAYOUT
 // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
 
 // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
 // RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
 
-// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fdump-record-layouts %s -o %t.ll 1> %t.ogcglayout
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fdump-record-layouts %s -o %t.ll > %t.ogcglayout
 // RUN: FileCheck --input-file=%t.ogcglayout %s --check-prefix=OGCG-LAYOUT
 // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
 
diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp
index 2887e6f..cb50999 100644
--- a/clang/test/CIR/CodeGen/struct-init.cpp
+++ b/clang/test/CIR/CodeGen/struct-init.cpp
@@ -15,6 +15,16 @@ S partial_init = { 1 };
 // LLVM: @partial_init = global %struct.S { i32 1, i32 0, i32 0 }
 // OGCG: @partial_init = global %struct.S { i32 1, i32 0, i32 0 }
 
+struct StructWithDefaultInit {
+  int a = 2; // Default member initializer; `swdi = {}` below must still emit 2.
+};
+
+StructWithDefaultInit swdi = {};
+
+// CIR: cir.global external @swdi = #cir.const_record<{#cir.int<2> : !s32i}> : !rec_StructWithDefaultInit
+// LLVM: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4
+// OGCG: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4
+
 void init() {
   S s1 = {1, 2, 3};
   S s2 = {4, 5};
diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp
new file mode 100644
index 0000000..9882f05
--- /dev/null
+++ b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+
+void use(int x, unsigned int y, float f) { // Exercises `acc atomic read` lowering.
+  // CHECK: cir.func{{.*}}(%[[X_ARG:.*]]: !s32i{{.*}}, %[[Y_ARG:.*]]: !u32i{{.*}}, %[[F_ARG:.*]]: !cir.float{{.*}}){{.*}}{
+  // CHECK-NEXT: %[[X_ALLOC:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init]
+  // CHECK-NEXT: %[[Y_ALLOC:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["y", init]
+  // CHECK-NEXT: %[[F_ALLOC:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["f", init]
+  // CHECK-NEXT: cir.store %[[X_ARG]], %[[X_ALLOC]] : !s32i, !cir.ptr<!s32i>
+  // CHECK-NEXT: cir.store %[[Y_ARG]], %[[Y_ALLOC]] : !u32i, !cir.ptr<!u32i>
+  // CHECK-NEXT: cir.store %[[F_ARG]], %[[F_ALLOC]] : !cir.float, !cir.ptr<!cir.float>
+
+  // CHECK-NEXT: acc.atomic.read %[[X_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr<!s32i>, !cir.ptr<!u32i>, !s32i
+#pragma acc atomic read
+  x = y; // Unconditional read; destination (int) and source (unsigned) types differ.
+
+  // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load{{.*}} %[[X_ALLOC]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[X_CAST:.*]] = cir.cast integral %[[X_LOAD]] : !s32i -> !u32i
+  // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load{{.*}} %[[Y_ALLOC]] : !cir.ptr<!u32i>, !u32i
+  // CHECK-NEXT: %[[CMP:.*]] = cir.cmp(eq, %[[X_CAST]], %[[Y_LOAD]]) : !u32i, !cir.bool
+  // CHECK-NEXT: %[[CMP_CAST:.*]] = builtin.unrealized_conversion_cast %[[CMP]] : !cir.bool to i1
+  // CHECK-NEXT: acc.atomic.read if(%[[CMP_CAST]]) %[[F_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr<!cir.float>, !cir.ptr<!u32i>, !cir.float
+#pragma acc atomic read if (x == y)
+  f = y; // Same read guarded by an `if` clause; the condition is lowered first.
+}
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
index c1c2e4b..53eba7b 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct DefaultOperators {
   int i;
@@ -480,7 +480,77 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -507,7 +577,77 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -1532,7 +1672,101 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -1576,7 +1810,101 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -2398,6 +2726,104 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -2446,6 +2872,104 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
index 853f345..63d6952 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 template<typename T>
 void acc_combined() {
   T someVar;
@@ -92,7 +92,18 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -106,7 +117,18 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -371,7 +393,41 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -401,7 +457,41 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -708,7 +798,45 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -744,7 +872,45 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
index 67e8460..9c1b161 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct HasOperatorsInline {
   int i;
@@ -14,8 +14,8 @@ struct HasOperatorsInline {
   HasOperatorsInline &operator&=(HasOperatorsInline& other);
   HasOperatorsInline &operator|=(HasOperatorsInline& other);
   HasOperatorsInline &operator^=(HasOperatorsInline& other);
-  bool &operator&&(HasOperatorsInline& other);
-  bool &operator||(HasOperatorsInline& other);
+  HasOperatorsInline &operator&&(HasOperatorsInline& other);
+  HasOperatorsInline &operator||(HasOperatorsInline& other);
   // For min/max
   bool operator<(HasOperatorsInline& other);
   HasOperatorsInline &operator=(HasOperatorsInline& other);
@@ -277,7 +277,8 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -286,7 +287,7 @@ void acc_combined() {
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
 #pragma acc parallel loop reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr<!rec_HasOperatorsInline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsInline> -> !cir.ptr<!s32i>
@@ -308,7 +309,8 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -318,7 +320,7 @@ void acc_combined() {
   for(int i=0;i < 5; ++i);
 
 #pragma acc parallel loop reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsInline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsInline>>, ["arrayinit.temp"]
@@ -1254,7 +1256,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -1318,7 +1344,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -2121,6 +2171,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2198,6 +2277,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
index d74de82..78b43dd 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template<typename T>
 void acc_combined() {
@@ -145,7 +145,18 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -159,7 +170,18 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -587,7 +609,41 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -617,7 +673,41 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -1116,7 +1206,45 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -1152,7 +1280,45 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
index a6df6c0..5b37071 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 struct HasOperatorsOutline {
   int i;
   unsigned u;
@@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
 // For min/max
 bool operator<(HasOperatorsOutline &, HasOperatorsOutline &);
 
@@ -276,7 +276,8 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -285,7 +286,7 @@ void acc_combined() {
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
 #pragma acc parallel loop reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -307,7 +308,8 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -317,7 +319,7 @@ void acc_combined() {
   for(int i=0;i < 5; ++i);
 
 #pragma acc parallel loop reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsOutline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsOutline>>, ["arrayinit.temp"]
@@ -1253,7 +1255,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -1317,7 +1343,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -2120,6 +2170,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2197,6 +2276,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
index d65d5d4..6ec1c43 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct DefaultOperators {
   int i;
@@ -485,7 +485,80 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   ;
@@ -512,7 +585,80 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   ;
@@ -1516,7 +1662,104 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -1546,7 +1789,104 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -2376,6 +2716,107 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -2424,6 +2865,107 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
index f32fa2d..7bd6f67 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct DefaultOperators {
   int i;
@@ -480,7 +480,77 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   ;
@@ -507,7 +577,77 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   ;
@@ -1532,7 +1672,101 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -1576,7 +1810,101 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -2398,6 +2726,104 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
@@ -2446,6 +2872,104 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
index 9f73367..13c335b 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 void acc_compute() {
   float someVar;
@@ -92,7 +92,19 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   ;
@@ -106,7 +118,19 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   ;
@@ -371,7 +395,42 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -401,7 +460,42 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -708,7 +802,46 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -744,7 +877,46 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
index ffd2631..6737821 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template<typename T>
 void acc_compute() {
@@ -93,7 +93,18 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   ;
@@ -107,7 +118,18 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   ;
@@ -372,7 +394,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -402,7 +458,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -709,7 +799,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
@@ -745,7 +873,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
index 1e367ee..262fe98 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct HasOperatorsInline {
   int i;
@@ -14,8 +14,8 @@ struct HasOperatorsInline {
   HasOperatorsInline &operator&=(HasOperatorsInline& other);
   HasOperatorsInline &operator|=(HasOperatorsInline& other);
   HasOperatorsInline &operator^=(HasOperatorsInline& other);
-  bool &operator&&(HasOperatorsInline& other);
-  bool &operator||(HasOperatorsInline& other);
+  HasOperatorsInline &operator&&(HasOperatorsInline& other);
+  HasOperatorsInline &operator||(HasOperatorsInline& other);
   // For min/max
   bool operator<(HasOperatorsInline& other);
   HasOperatorsInline &operator=(HasOperatorsInline& other);
@@ -277,7 +277,8 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -286,7 +287,7 @@ void acc_compute() {
 // CHECK-NEXT: }
   ;
 #pragma acc parallel reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr<!rec_HasOperatorsInline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsInline> -> !cir.ptr<!s32i>
@@ -308,7 +309,8 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -318,7 +320,7 @@ void acc_compute() {
   ;
 
 #pragma acc parallel reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsInline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsInline>>, ["arrayinit.temp"]
@@ -1254,7 +1256,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -1318,7 +1344,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -2121,6 +2171,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2198,6 +2277,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
index 2f42a5c..be7b123 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 void acc_compute() {
   int someVar;
@@ -144,7 +144,18 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   ;
@@ -158,7 +169,18 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   ;
@@ -586,7 +608,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -616,7 +672,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1115,7 +1205,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1151,7 +1279,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
index af7bcf3..fb6984f 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template<typename T>
 void acc_compute() {
@@ -145,7 +145,18 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   ;
@@ -159,7 +170,18 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   ;
@@ -587,7 +609,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -617,7 +673,41 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1116,7 +1206,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1152,7 +1280,45 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
index ec890e2..3a80ed5 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 struct HasOperatorsOutline {
   int i;
   unsigned u;
@@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
 // For min/max
 bool operator<(HasOperatorsOutline &, HasOperatorsOutline &);
 
@@ -276,7 +276,8 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -285,7 +286,7 @@ void acc_compute() {
 // CHECK-NEXT: }
   ;
 #pragma acc parallel  reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -307,7 +308,8 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -317,7 +319,7 @@ void acc_compute() {
   ;
 
 #pragma acc parallel  reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsOutline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsOutline>>, ["arrayinit.temp"]
@@ -1253,7 +1255,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -1317,7 +1343,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -2120,6 +2170,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2197,6 +2276,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
index 08daa70..9b10a29 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 void acc_compute() {
   unsigned int someVar;
@@ -143,7 +143,19 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!u32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!u32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr<!u32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!u32i>
 // CHECK-NEXT: }
   ;
@@ -157,7 +169,19 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!u32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!u32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr<!u32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!u32i>
 // CHECK-NEXT: }
   ;
@@ -585,7 +609,42 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -615,7 +674,42 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !s64i) -> !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1114,7 +1208,46 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !u64i) -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !u64i) -> !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>>
 // CHECK-NEXT: }
   ;
@@ -1150,7 +1283,46 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!u32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !u64i) -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!u32i x 5>> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!u32i>, !u64i) -> !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!u32i x 5>>
 // CHECK-NEXT: }
   ;
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
index 1a77c0f..11ebd7b 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct DefaultOperators {
   int i;
@@ -480,7 +480,77 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -507,7 +577,77 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_DefaultOperators> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_DefaultOperators>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -1532,7 +1672,101 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -1576,7 +1810,101 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !s64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -2398,6 +2726,104 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
@@ -2446,6 +2872,104 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
   for(int i = 0; i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
index 7faef71..57cc1af 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template<typename T>
 void acc_loop() {
@@ -93,7 +93,18 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -107,7 +118,18 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr<!cir.float>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.float>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -372,7 +394,41 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -402,7 +458,41 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !s64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -709,7 +799,45 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -745,7 +873,45 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
index 43c9fbbc..8a5bf3e 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 struct HasOperatorsInline {
   int i;
@@ -14,8 +14,8 @@ struct HasOperatorsInline {
   HasOperatorsInline &operator&=(HasOperatorsInline& other);
   HasOperatorsInline &operator|=(HasOperatorsInline& other);
   HasOperatorsInline &operator^=(HasOperatorsInline& other);
-  bool &operator&&(HasOperatorsInline& other);
-  bool &operator||(HasOperatorsInline& other);
+  HasOperatorsInline &operator&&(HasOperatorsInline& other);
+  HasOperatorsInline &operator||(HasOperatorsInline& other);
   // For min/max
   bool operator<(HasOperatorsInline& other);
   HasOperatorsInline &operator=(HasOperatorsInline& other);
@@ -277,7 +277,8 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -286,7 +287,7 @@ void acc_loop() {
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
 #pragma acc loop reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr<!rec_HasOperatorsInline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr<!rec_HasOperatorsInline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsInline> -> !cir.ptr<!s32i>
@@ -308,7 +309,8 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsInline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsInline> {{.*}}):  
@@ -318,7 +320,7 @@ void acc_loop() {
   for(int i=0;i < 5; ++i);
 
 #pragma acc loop reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsInline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsInline>>, ["arrayinit.temp"]
@@ -1254,7 +1256,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -1318,7 +1344,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !s64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}):  
@@ -2121,6 +2171,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2198,6 +2277,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsInline>, !u64i) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsInline>, !cir.ptr<!rec_HasOperatorsInline>) -> !cir.ptr<!rec_HasOperatorsInline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsInline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
index 5353218..f60dff9 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template<typename T>
 void acc_loop() {
@@ -145,7 +145,18 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -159,7 +170,18 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!s32i> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!s32i> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr<!s32i>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!s32i>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -587,7 +609,41 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -617,7 +673,41 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !s64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -1116,7 +1206,45 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false
+// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
@@ -1152,7 +1280,45 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!s32i x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!s32i x 5>> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
index e193cfa..8613bc8a 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 struct HasOperatorsOutline {
   int i;
   unsigned u;
@@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &);
 HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
-bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
+HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
 // For min/max
 bool operator<(HasOperatorsOutline &, HasOperatorsOutline &);
 
@@ -276,7 +276,8 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -285,7 +286,7 @@ void acc_loop() {
 // CHECK-NEXT: }
   for(int i=0;i < 5; ++i);
 #pragma acc loop reduction(||:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -307,7 +308,8 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):  
@@ -317,7 +319,7 @@ void acc_loop() {
   for(int i=0;i < 5; ++i);
 
 #pragma acc loop reduction(+:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
+// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> reduction_operator <add> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_HasOperatorsOutline x 5>, !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!cir.ptr<!rec_HasOperatorsOutline>>, ["arrayinit.temp"]
@@ -1253,7 +1255,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -1317,7 +1343,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !s64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}):  
@@ -2120,6 +2170,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
@@ -2197,6 +2276,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_HasOperatorsOutline>, !u64i) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[ARG:.*]]: !cir.ptr<!cir.array<!rec_HasOperatorsOutline x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): 
diff --git a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
index 20ad7a3..29f1b5f 100644
--- a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 // Note: unlike the 'private' recipe checks, this is just for spot-checking,
 // so this test isn't as comprehensive.  The same code paths are used for
@@ -753,8 +753,97 @@ void do_things(unsigned A, unsigned B) {
 // CHECK-NEXT: }
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
-// CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
-// CHECK-NEXT: acc.yield
+// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB3:.*]] = acc.get_lowerbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB3]] : index to !u64i
+// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i
+// CHECK-NEXT: %[[ITR3:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB3_CAST]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR3_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_TLA_LOAD:.*]] = cir.load %[[LHSARG]] : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>>, !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>
+// CHECK-NEXT: %[[LHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[LHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, !u64i) -> !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>
+// CHECK-NEXT: %[[RHS_TLA_LOAD:.*]] = cir.load %[[RHSARG]] : !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>>, !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>
+// CHECK-NEXT: %[[RHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[RHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>, !u64i) -> !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB2]] : index to !u64i
+// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
+// CHECK-NEXT: %[[ITR2:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB2_CAST]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR2_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHS_BOUND3_STRIDE]] : !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>> -> !cir.ptr<!cir.ptr<!rec_NoOps>>
+// CHECK-NEXT: %[[LHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr<!cir.ptr<!rec_NoOps>>, !u64i) -> !cir.ptr<!cir.ptr<!rec_NoOps>>
+// CHECK-NEXT: %[[RHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHS_BOUND3_STRIDE]] : !cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>> -> !cir.ptr<!cir.ptr<!rec_NoOps>>
+// CHECK-NEXT: %[[RHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr<!cir.ptr<!rec_NoOps>>, !u64i) -> !cir.ptr<!cir.ptr<!rec_NoOps>>
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB1]] : index to !u64i
+// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
+// CHECK-NEXT: %[[ITR1:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB1_CAST]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR1_LOAD]], %[[UB1_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[LHS_BOUND2_STRIDE]] : !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!rec_NoOps>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr<!rec_NoOps>, !u64i) -> !cir.ptr<!rec_NoOps>
+// CHECK-NEXT: %[[RHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[RHS_BOUND2_STRIDE]] : !cir.ptr<!cir.ptr<!rec_NoOps>>, !cir.ptr<!rec_NoOps>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr<!rec_NoOps>, !u64i) -> !cir.ptr<!rec_NoOps>
+// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_NoOps> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_NoOps> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST_BOOL]], true {
+// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true
+// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.yield %[[RHS_CAST_BOOL]] : !cir.bool
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_INT]], %[[LHS_GET_I]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: acc.yield %[[LHSARG]]
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr<!cir.ptr<!cir.array<!cir.ptr<!rec_NoOps> x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
 // CHECK-NEXT: cir.scope {
diff --git a/clang/test/CodeGen/AArch64/sign-return-address.c b/clang/test/CodeGen/AArch64/sign-return-address.c
index 11dd683..2b505de 100644
--- a/clang/test/CodeGen/AArch64/sign-return-address.c
+++ b/clang/test/CodeGen/AArch64/sign-return-address.c
@@ -28,17 +28,17 @@
 // NONE-NOT:  !"branch-target-enforcement"
 // ALL-NOT:   !"branch-target-enforcement"
 // PART-NOT:  !"branch-target-enforcement"
-// BTE:       !{i32 8, !"branch-target-enforcement", i32 1}
+// BTE:       !{i32 8, !"branch-target-enforcement", i32 2}
 // B-KEY-NOT: !"branch-target-enforcement"
 
 // NONE-NOT:  !"sign-return-address"
-// ALL:   !{i32 8, !"sign-return-address", i32 1}
-// PART:  !{i32 8, !"sign-return-address", i32 1}
+// ALL:   !{i32 8, !"sign-return-address", i32 2}
+// PART:  !{i32 8, !"sign-return-address", i32 2}
 // BTE-NOT:   !"sign-return-address"
-// B-KEY: !{i32 8, !"sign-return-address", i32 1}
+// B-KEY: !{i32 8, !"sign-return-address", i32 2}
 
 // NONE-NOT:  !"sign-return-address-all"
-// ALL:   !{i32 8, !"sign-return-address-all", i32 1}
+// ALL:   !{i32 8, !"sign-return-address-all", i32 2}
 // PART-NOT:  !"sign-return-address-all"
 // BTE-NOT:   !"sign-return-address-all"
 // B-KEY-NOT: !"sign-return-address-all"
@@ -47,6 +47,6 @@
 // ALL-NOT:   !"sign-return-address-with-bkey"
 // PART-NOT:  !"sign-return-address-with-bkey"
 // BTE-NOT:   !"sign-return-address-with-bkey"
-// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 1}
+// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 2}
 
 void foo() {}
diff --git a/clang/test/CodeGen/arm-branch-protection-attr-2.c b/clang/test/CodeGen/arm-branch-protection-attr-2.c
index fad5dc0..5391537 100644
--- a/clang/test/CodeGen/arm-branch-protection-attr-2.c
+++ b/clang/test/CodeGen/arm-branch-protection-attr-2.c
@@ -23,16 +23,16 @@
 // NONE-NOT: !"branch-target-enforcement"
 // PART-NOT: !"branch-target-enforcement"
 // ALL-NOT:  !"branch-target-enforcement"
-// BTE:      !{i32 8, !"branch-target-enforcement", i32 1}
+// BTE:      !{i32 8, !"branch-target-enforcement", i32 2}
 
 // NONE-NOT: !"sign-return-address"
-// PART:     !{i32 8, !"sign-return-address", i32 1}
-// ALL:      !{i32 8, !"sign-return-address", i32 1}
+// PART:     !{i32 8, !"sign-return-address", i32 2}
+// ALL:      !{i32 8, !"sign-return-address", i32 2}
 // BTE-NOT:  !"sign-return-address"
 
 // NONE-NOT: !"sign-return-address-all", i32 0}
 // PART-NOT: !"sign-return-address-all", i32 0}
-// ALL:      !{i32 8, !"sign-return-address-all", i32 1}
+// ALL:      !{i32 8, !"sign-return-address-all", i32 2}
 // BTE-NOT:  !"sign-return-address-all", i32 0}
 
 void foo() {}
diff --git a/clang/test/Driver/fsanitize-alloc-token.c b/clang/test/Driver/fsanitize-alloc-token.c
index 2964f60..6d8bda1 100644
--- a/clang/test/Driver/fsanitize-alloc-token.c
+++ b/clang/test/Driver/fsanitize-alloc-token.c
@@ -41,3 +41,14 @@
 // CHECK-MAX: "-falloc-token-max=42"
 // RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -falloc-token-max=-1 %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MAX %s
 // CHECK-INVALID-MAX: error: invalid value
+
+// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=increment %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-INCREMENT %s
+// CHECK-MODE-INCREMENT: "-falloc-token-mode=increment"
+// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=random %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-RANDOM %s
+// CHECK-MODE-RANDOM: "-falloc-token-mode=random"
+// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehash %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASH %s
+// CHECK-MODE-TYPEHASH: "-falloc-token-mode=typehash"
+// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehashpointersplit %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASHPTRSPLIT %s
+// CHECK-MODE-TYPEHASHPTRSPLIT: "-falloc-token-mode=typehashpointersplit"
+// RUN: not %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=asdf %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MODE %s
+// CHECK-INVALID-MODE: error: invalid value 'asdf'
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 2f40fd4..840334e 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -214,3 +214,10 @@
 // AMDGCNSPIRV: {{".*clang-offload-bundler.*"}} "-type=o"
 // AMDGCNSPIRV-SAME: "-targets={{.*}}hipv4-spirv64-amd-amdhsa--amdgcnspirv,hipv4-amdgcn-amd-amdhsa--gfx900"
 // AMDGCNSPIRV-SAME: "-input=[[AMDGCNSPV_CO]]" "-input=[[GFX900_CO]]"
+
+// Check verbose printing with the new driver.
+// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \
+// RUN:   --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE
+// VERBOSE: clang-linker-wrapper
+// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v
+// VERBOSE-NOT: --wrapper-verbose
diff --git a/clang/test/Frontend/arm-ignore-branch-protection-option.c b/clang/test/Frontend/arm-ignore-branch-protection-option.c
index 99a2acc..45bdb37 100644
--- a/clang/test/Frontend/arm-ignore-branch-protection-option.c
+++ b/clang/test/Frontend/arm-ignore-branch-protection-option.c
@@ -15,4 +15,4 @@ __attribute__((target("arch=cortex-m0"))) void f() {}
 // CHECK-NOT:  attributes { {{.*}} "branch-target-enforcement"
 
 /// Check that there are branch protection module attributes despite the warning.
-// CHECK: !{i32 8, !"branch-target-enforcement", i32 1}
+// CHECK: !{i32 8, !"branch-target-enforcement", i32 2}
diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c
index 588df70..d0712fb 100644
--- a/clang/test/Interpreter/pretty-print.c
+++ b/clang/test/Interpreter/pretty-print.c
@@ -75,9 +75,10 @@ int * ptr = (int*)0x123; ptr
 int * null_ptr = (int*)0; null_ptr
 // CHECK-NEXT: (int *) 0x0
 
+union U { int I; float F; } u; u.I = 12; u.I
+// CHECK-NEXT: (int) 12
+
 // TODO: _Bool, _Complex, _Atomic, and _BitInt
-// union U { int I; float F; } u; u.I = 12; u.I
-// TODO-CHECK-NEXT: (int) 12
 // struct S1{} s1; s1
 // TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}}
 
@@ -86,4 +87,21 @@ int * null_ptr = (int*)0; null_ptr
 // E.d
 // TODO-CHECK-NEXT: (int) 22
 
+// -----------------------------------------------------------------------------
+// Tentative definition handling (C99 6.9.2)
+// Verify that multiple distinct tentative definitions across inputs no longer
+// conflict. Each variable should emit correctly in its own incremental module.
+// -----------------------------------------------------------------------------
+
+int t1;
+int t2;
+int t3;
+t1 = 1; t2 = 2; t3 = 3;
+t1 + t2 + t3
+// CHECK-NEXT: (int) 6
+
+// A redefinition of an existing tentative variable should still fail.
+int t1;
+// expected-error {{duplicate definition of symbol '_t1'}}
+
 %quit
diff --git a/clang/test/Sema/callingconv-devicekernel.cpp b/clang/test/Sema/callingconv-devicekernel.cpp
new file mode 100644
index 0000000..f5da873
--- /dev/null
+++ b/clang/test/Sema/callingconv-devicekernel.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple spir64 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple spirv64 -fsyntax-only -verify %s
+
+// A device kernel must be externally visible; internal linkage is an error.
+[[clang::device_kernel]] void kernel1() {}
+
+namespace {
+[[clang::device_kernel]] void kernel2() {} // expected-error {{'kernel2' is specified as a device kernel but it is not externally visible}}
+}
+namespace ns {
+  [[clang::device_kernel]] void kernel3() {}
+}
+
+[[clang::device_kernel]] static void kernel4() {} // expected-error {{'kernel4' is specified as a device kernel but it is not externally visible}}
diff --git a/clang/test/Sema/callingconv.c b/clang/test/Sema/callingconv.c
index f0b8b80..28342b5 100644
--- a/clang/test/Sema/callingconv.c
+++ b/clang/test/Sema/callingconv.c
@@ -55,6 +55,10 @@ int __attribute__((aarch64_vector_pcs)) aavpcs(void); // expected-warning {{'aar
 int __attribute__((aarch64_sve_pcs)) aasvepcs(void);  // expected-warning {{'aarch64_sve_pcs' calling convention is not supported for this target}}
 
 int __attribute__((amdgpu_kernel)) amdgpu_kernel(void); // expected-warning {{'amdgpu_kernel' calling convention is not supported for this target}}
+int __attribute__((device_kernel)) device_kernel(void) { // expected-warning {{'device_kernel' calling convention is not supported for this target}}
+}
+int __attribute__((sycl_kernel)) sycl_kernel(void) { // expected-warning {{'sycl_kernel' attribute ignored}}
+}
 
 // PR6361
 void ctest3();
diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
index 137f46e..289059e 100644
--- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
+++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
@@ -157,66 +157,55 @@ static_assert(And1<S, S>() == 1);
 // FIXME: The diagnostics are not so great
 static_assert(And1<int>() == 1); // expected-error {{no matching function for call to 'And1'}}
                                  // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int>]}}
-                                 // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
-                                 // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                 // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And1<S, int>() == 1); // expected-error {{no matching function for call to 'And1'}}
                                    // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <S, int>]}}
-                                   // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And1<int, S>() == 1); // expected-error {{no matching function for call to 'And1'}}
                                    // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = <int, S>]}}
-                                   // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And2<S>() == 2);
 static_assert(And2<S, S>() == 2);
 static_assert(And2<int>() == 2);  // expected-error {{no matching function for call to 'And2'}}
                                   // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
-                                  // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}}
-                                  // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                  // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 
 static_assert(And2<int, int>() == 2);  // expected-error {{no matching function for call to 'And2'}}
                                       // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}} \
-                                      // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                      // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And2<S, int>() == 2); // expected-error {{no matching function for call to 'And2'}}
                                    // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}}
-                                   // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}}
-                                 // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And2<int, S>() == 2); // expected-error {{no matching function for call to 'And2'}}
                                    // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}}
-                                   // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}}
-                                 // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(And3<S>() == 3);
 static_assert(And3<S, S>() == 3);
 static_assert(And3<int>() == 3);   // expected-error {{no matching function for call to 'And3'}}
                                    // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
-                                   // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 
 static_assert(And3<int, int>() == 3);  // expected-error {{no matching function for call to 'And3'}}
                                       // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <int>]}}
-                                      // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
-                                     // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                      // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 
 static_assert(And3<S, int>() == 3); // expected-error {{no matching function for call to 'And3'}}
                                    // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = <int>]}}
-                                   // expected-note@#and3 {{because 'typename U::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 
 static_assert(And3<int, S>() == 3); // expected-error {{no matching function for call to 'And3'}}
                                    // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <S>]}}
-                                   // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}}
-                                   // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                   // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 
 static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}}
@@ -227,8 +216,7 @@ static_assert(Or1<S, int>() == 1);
 static_assert(Or1<S, S>() == 1);
 static_assert(Or1<int>() == 1); // expected-error {{no matching function for call to 'Or1'}}
                                 // expected-note@#or1 {{candidate template ignored: constraints not satisfied}}
-                                // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}}
-                                // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                // expected-note@#or1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 
 static_assert(Or2<S>() == 2);
 static_assert(Or2<int, S>() == 2);
@@ -236,16 +224,14 @@ static_assert(Or2<S, int>() == 2);
 static_assert(Or2<S, S>() == 2);
 static_assert(Or2<int>() == 2); // expected-error {{no matching function for call to 'Or2'}}
                                 // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}}
-                                // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}}
-                                // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                // expected-note@#or2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 static_assert(Or3<S>() == 3);
 static_assert(Or3<int, S>() == 3);
 static_assert(Or3<S, int>() == 3);
 static_assert(Or3<S, S>() == 3);
 static_assert(Or3<int>() == 3); // expected-error {{no matching function for call to 'Or3'}}
                                 // expected-note@#or3 {{candidate template ignored: constraints not satisfied}}
-                                // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}}
-                                // expected-note@#C {{because 'T' does not satisfy 'A'}}
+                                // expected-note@#or3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}}
 }
 
 namespace bool_conversion_break {
diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
index 73dce93..5e1bce5 100644
--- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp
+++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
@@ -82,7 +82,6 @@ auto it = begin(rng); // #BEGIN_CALL
 // expected-error@#BEGIN_CALL {{no matching function for call to 'begin'}}
 // expected-note@#NOTINF_BEGIN {{candidate function}}
 // expected-note@#INF_BEGIN{{candidate template ignored: constraints not satisfied}}
-// expected-note@#INF_BEGIN{{because 'Inf auto' does not satisfy 'Inf}}
 }
 } // namespace DirectRecursiveCheck
 
diff --git a/clang/tools/clang-fuzzer/Dockerfile b/clang/tools/clang-fuzzer/Dockerfile
deleted file mode 100644
index 1ddf829..0000000
--- a/clang/tools/clang-fuzzer/Dockerfile
+++ /dev/null
@@ -1,41 +0,0 @@
-#===- llvm/tools/clang/tools/clang-fuzzer ---------------------------------===//
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===----------------------------------------------------------------------===//
-# Produces an image that builds clang-proto-fuzzer
-FROM ubuntu:16.04
-RUN apt-get update -y
-RUN apt-get install -y autoconf automake libtool curl make g++ unzip wget git \
-    binutils liblzma-dev libz-dev python-all cmake ninja-build subversion \
-    pkg-config docbook2x
-
-WORKDIR /root
-
-# Get protobuf
-RUN wget -qO- https://github.com/google/protobuf/releases/download/v3.3.0/protobuf-cpp-3.3.0.tar.gz | tar zxf -
-RUN cd protobuf-3.3.0 && ./autogen.sh && ./configure && make -j $(nproc) && make check -j $(nproc) && make install && ldconfig
-# Get LLVM
-RUN svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm
-RUN cd llvm/tools && svn co http://llvm.org/svn/llvm-project/cfe/trunk clang -r $(cd ../ && svn info | grep Revision | awk '{print $2}')
-RUN cd llvm/projects && svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt -r $(cd ../ && svn info | grep Revision | awk '{print $2}')
-# Build plain LLVM (stage 0)
-RUN mkdir build0 && cd build0 && cmake -GNinja -DCMAKE_BUILD_TYPE=Release ../llvm && ninja
-# Configure instrumented LLVM (stage 1)
-RUN mkdir build1 && cd build1 && cmake -GNinja -DCMAKE_BUILD_TYPE=Release ../llvm \
-    -DLLVM_ENABLE_ASSERTIONS=ON \
-    -DCMAKE_C_COMPILER=`pwd`/../build0/bin/clang \
-    -DCMAKE_CXX_COMPILER=`pwd`/../build0/bin/clang++ \
-    -DLLVM_USE_SANITIZE_COVERAGE=YES \
-    -DLLVM_USE_SANITIZER=Address -DCLANG_ENABLE_PROTO_FUZZER=ON
-# Build the fuzzers
-RUN cd build1 && ninja clang-fuzzer
-RUN cd build1 && ninja clang-objc-fuzzer
-RUN cd build1 && ninja clang-proto-fuzzer
-RUN cd build1 && ninja clang-proto-to-cxx
-RUN cd build1 && ninja clang-loop-proto-to-cxx
-RUN cd build1 && ninja clang-loop-proto-to-llvm
-RUN cd build1 && ninja clang-loop-proto-fuzzer
-RUN cd build1 && ninja clang-llvm-proto-fuzzer
diff --git a/clang/tools/clang-fuzzer/README.txt b/clang/tools/clang-fuzzer/README.txt
index eec4a9e..218c544 100644
--- a/clang/tools/clang-fuzzer/README.txt
+++ b/clang/tools/clang-fuzzer/README.txt
@@ -99,9 +99,6 @@ Example:
     -DCLANG_ENABLE_PROTO_FUZZER=ON
   ninja clang-proto-fuzzer clang-proto-to-cxx
 
-This directory also contains a Dockerfile which sets up all required
-dependencies and builds the fuzzers.
-
 ============================
  Running clang-proto-fuzzer
 ============================
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 16abeb1..4e66485 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -114,6 +114,7 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
 
   // Set up compilation.
   std::unique_ptr<CompilerInstance> CI(new CompilerInstance());
+  CI->setVirtualFileSystem(FM->getVirtualFileSystemPtr());
   CI->setFileManager(FM);
   CI->createDiagnostics();
   if (!CI->hasDiagnostics())
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index ca99940..f363738 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1129,11 +1129,6 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   // Not TT_FunctionDeclarationName.
   EXPECT_TOKEN(Tokens[3], tok::kw_operator, TT_Unknown);
-
-  Tokens = annotate("SomeAPI::operator()();");
-  ASSERT_EQ(Tokens.size(), 9u) << Tokens;
-  // Not TT_FunctionDeclarationName.
-  EXPECT_TOKEN(Tokens[2], tok::kw_operator, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, OverloadedOperatorInTemplate) {
diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp
index a00632f..c592e74 100644
--- a/clang/unittests/Sema/HeuristicResolverTest.cpp
+++ b/clang/unittests/Sema/HeuristicResolverTest.cpp
@@ -524,6 +524,28 @@ TEST(HeuristicResolver, MemberExpr_HangIssue126536) {
       cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input"));
 }
 
+TEST(HeuristicResolver, MemberExpr_HangOnLongCallChain) {
+  const size_t CallChainLength = 50;
+  std::string Code = R"cpp(
+    template <typename T>
+    void foo(T t) {
+      t
+    )cpp";
+  for (size_t I = 0; I < CallChainLength; ++I)
+    Code.append(".method()\n");
+  Code.append(R"cpp(
+      .lastMethod();
+    }
+  )cpp");
+  // Regression test: resolving a member name whose base is a long chain of
+  // calls on a template-typed value must not hang. The hang being guarded
+  // against is finite (runtime exponential in the length of the chain), so
+  // a "failure" here manifests as abnormally long runtime.
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("lastMethod")).bind("input"));
+}
+
 TEST(HeuristicResolver, MemberExpr_DefaultTemplateArgument) {
   std::string Code = R"cpp(
     struct Default {
diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
index aa32bb3..4523af3 100644
--- a/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
+++ b/clang/unittests/Tooling/DependencyScanning/DependencyScannerTest.cpp
@@ -49,6 +49,8 @@ private:
   std::vector<std::string> &Deps;
 };
 
+// FIXME: Use the regular Service/Worker/Collector APIs instead of
+//        reimplementing the action.
 class TestDependencyScanningAction : public tooling::ToolAction {
 public:
   TestDependencyScanningAction(std::vector<std::string> &Deps) : Deps(Deps) {}
@@ -59,6 +61,7 @@ public:
                      DiagnosticConsumer *DiagConsumer) override {
     CompilerInstance Compiler(std::move(Invocation),
                               std::move(PCHContainerOps));
+    Compiler.setVirtualFileSystem(FileMgr->getVirtualFileSystemPtr());
     Compiler.setFileManager(FileMgr);
 
     Compiler.createDiagnostics(DiagConsumer, /*ShouldOwnClient=*/false);
diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp
index 6094177..47184cb 100644
--- a/clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -134,6 +134,7 @@ public:
         FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
     CompilerInstance Compiler(std::move(CI));
     Compiler.setDiagnostics(Diags);
+    Compiler.setVirtualFileSystem(FS);
     Compiler.setFileManager(FileMgr);
     Compiler.setSourceManager(SourceMgr);
 
diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
index 400a0d5..b2be64f 100644
--- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -153,6 +153,7 @@ SyntaxTreeTest::buildTree(StringRef Code, const TestClangConfig &ClangConfig) {
       FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
   CompilerInstance Compiler(Invocation);
   Compiler.setDiagnostics(Diags);
+  Compiler.setVirtualFileSystem(FS);
   Compiler.setFileManager(FileMgr);
   Compiler.setSourceManager(SourceMgr);
 
diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h
index d28f73f..ac119af 100644
--- a/compiler-rt/lib/builtins/assembly.h
+++ b/compiler-rt/lib/builtins/assembly.h
@@ -337,7 +337,8 @@
 #endif
 #endif
 
-#if defined(__ASSEMBLER__) && (defined(__i386__) || defined(__amd64__))
+#if defined(__ASSEMBLER__) && (defined(__i386__) || defined(__amd64__)) &&     \
+    !defined(__arm64ec__)
 .att_syntax
 #endif
 
diff --git a/flang/docs/AssumedRank.md b/flang/docs/AssumedRank.md
index c5d2c3e..0154adc 100644
--- a/flang/docs/AssumedRank.md
+++ b/flang/docs/AssumedRank.md
@@ -101,9 +101,9 @@ Assumed-rank dummies are also represented in the
 represent assumed-rank in procedure characteristics.
 
 ### Runtime Representation of Assumed-Ranks
-Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of an
-f18 specific addendum when required for the type. This is the usual f18
-descriptor, and no changes is required to represent assumed-ranks in this data
+Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of a
+Flang specific addendum when required for the type. This is the usual Flang
+descriptor, and no changes are required to represent assumed-ranks in this data
 structure. In fact, there is no difference between the runtime descriptor
 created for an assumed shape and the runtime descriptor created when the
 corresponding entity is passed as an assumed-rank.
diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md
index f36110a..9137827 100644
--- a/flang/docs/C++17.md
+++ b/flang/docs/C++17.md
@@ -6,7 +6,7 @@
   
 -->
 
-# C++14/17 features used in f18
+# C++14/17 features used in Flang
 
 ```{contents}
 ---
@@ -27,7 +27,7 @@ out the details of how our C++ code should look and gives
 guidance about feature usage.
 
 We have chosen to use some features of the recent C++17
-language standard in f18.
+language standard in Flang.
 The most important of these are:
 * sum types (discriminated unions) in the form of `std::variant`
 * `using` template parameter packs
@@ -41,7 +41,7 @@ in this list because it's not particularly well known.)
 ## Sum types
 
 First, some background information to explain the need for sum types
-in f18.
+in Flang.
 
 Fortran is notoriously problematic to lex and parse, as tokenization
 depends on the state of the partial parse;
@@ -57,7 +57,7 @@ a unified lexer/parser.
 We have chosen to do so because it is simpler and should reduce
 both initial bugs and long-term maintenance.
 
-Specifically, f18's parser uses the technique of recursive descent with
+Specifically, Flang's parser uses the technique of recursive descent with
 backtracking.
 It is constructed as the incremental composition of pure parsing functions
 that each, when given a context (location in the input stream plus some state),
@@ -73,7 +73,7 @@ of Fortran.
 
 The specification of Fortran uses a form of BNF with alternatives,
 optional elements, sequences, and lists.  Each of these constructs
-in the Fortran grammar maps directly in the f18 parser to both
+in the Fortran grammar maps directly in Flang's parser to both
 the means of combining other parsers as alternatives, &c., and to
 the declarations of the parse tree data structures that represent
 the results of successful parses.
@@ -87,10 +87,10 @@ The bounded polymorphism supplied by the C++17 `std::variant` fits
 those needs exactly.
 For example, production R502 in Fortran defines the top-level
 program unit of Fortran as being a function, subroutine, module, &c.
-The `struct ProgramUnit` in the f18 parse tree header file
+`struct ProgramUnit` in the Flang parse tree header file
 represents each program unit with a member that is a `std::variant`
 over the six possibilities.
-Similarly, the parser for that type in the f18 grammar has six alternatives,
+Similarly, the parser for that type in Flang's grammar has six alternatives,
 each of which constructs an instance of `ProgramUnit` upon the result of
 parsing a `Module`, `FunctionSubprogram`, and so on.
 
@@ -99,7 +99,7 @@ parse is typically implemented with overloaded functions.
 A function instantiated on `ProgramUnit` will use `std::visit` to
 identify the right alternative and perform the right actions.
 The call to `std::visit` must pass a visitor that can handle all
-of the possibilities, and f18 will fail to build if one is missing.
+of the possibilities, and Flang will fail to build if one is missing.
 
 Were we unable to use `std::variant` directly, we would likely
 have chosen to implement a local `SumType` replacement; in the
diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md
index cbb96f1..a4ca962 100644
--- a/flang/docs/C++style.md
+++ b/flang/docs/C++style.md
@@ -30,7 +30,7 @@ is clear on usage, follow it.
   is pretty good and comes with lots of justifications for its rules.
 * Reasonable exceptions to these guidelines can be made.
 * Be aware of some workarounds for known issues in older C++ compilers that should
-  still be able to compile f18. They are listed at the end of this document.
+  still be able to compile Flang. They are listed at the end of this document.
 
 ## In particular:
 
@@ -261,7 +261,7 @@ move semantics, member access, and comparison for equality; suitable for use in
 `std::variant<>`.
 * `std::unique_ptr<>`: A nullable pointer with ownership, null by default,
 not copyable, reassignable.
-F18 has a helpful `Deleter<>` class template that makes `unique_ptr<>`
+Flang has a helpful `Deleter<>` class template that makes `unique_ptr<>`
 easier to use with forward-referenced data types.
 * `std::shared_ptr<>`: A nullable pointer with shared ownership via reference
 counting, null by default, shallowly copyable, reassignable, and slow.
@@ -312,9 +312,9 @@ Consistency is one of many aspects in the pursuit of clarity,
 but not an end in itself.
 
 ## C++ compiler bug workarounds
-Below is a list of workarounds for C++ compiler bugs met with f18 that, even
-if the bugs are fixed in latest C++ compiler versions, need to be applied so
-that all desired tool-chains can compile f18.
+Below is a list of workarounds for C++ compiler bugs encountered when building
+Flang. Even if the bugs are fixed in the latest C++ compiler versions, these need to
+be applied so that all desired tool-chains can compile Flang.
 
 ### Explicitly move noncopyable local variable into optional results
 
@@ -338,7 +338,7 @@ std::optional<CantBeCopied> fooOK() {
 }
 ```
 The underlying bug is actually not specific to `std::optional` but this is the most common
-case in f18 where the issue may occur. The actual bug can be reproduced with any class `B`
+case in Flang where the issue may occur. The actual bug can be reproduced with any class `B`
 that has a perfect forwarding constructor taking `CantBeCopied` as argument:
 `template<typename CantBeCopied> B(CantBeCopied&& x) x_{std::forward<CantBeCopied>(x)} {}`.
 In such scenarios, Ubuntu 18.04 g++ fails to instantiate the move constructor
diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md
index f518dc0..f27af1a 100644
--- a/flang/docs/Calls.md
+++ b/flang/docs/Calls.md
@@ -529,7 +529,7 @@ PGI passes host instance links in descriptors in additional arguments
 that are not always successfully forwarded across implicit interfaces,
 sometimes leading to crashes when they turn out to be needed.
 
-F18 will manage a pool of trampolines in its runtime support library
+Flang will manage a pool of trampolines in its runtime support library
 that can be used to pass internal procedures as effective arguments
 to F77ish procedures, so that
 a bare code address can serve to represent the effective argument.
@@ -569,14 +569,14 @@ Fortran 2018 explicitly enables us to do this with a correction to Fortran
 2003 in 4.3.4(5).
 
 Last, there must be reasonably permanent naming conventions used
-by the F18 runtime library for those unrestricted specific intrinsic
+by Flang's runtime library for those unrestricted specific intrinsic
 functions (table 16.2 in 16.8) and extensions that can be passed as
 arguments.
 
 In these cases where external naming is at the discretion
 of the implementation, we should use names that are not in the C language
 user namespace, begin with something that identifies
-the current incompatible version of F18, the module, the submodule, and
+the current incompatible version of Flang, the module, the submodule, and
 elemental SIMD width, and are followed by the external name.
 The parts of the external name can be separated by some character that
 is acceptable for use in LLVM IR and assembly language but not in user
diff --git a/flang/docs/Character.md b/flang/docs/Character.md
index 4e1d407..96e0a06 100644
--- a/flang/docs/Character.md
+++ b/flang/docs/Character.md
@@ -6,7 +6,7 @@
 
 -->
 
-# Implementation of `CHARACTER` types in f18
+# Implementation of `CHARACTER` types in Flang
 
 ```{contents}
 ---
@@ -16,7 +16,7 @@ local:
 
 ## Kinds and Character Sets
 
-The f18 compiler and runtime support three kinds of the intrinsic
+The Flang compiler and runtime support three kinds of the intrinsic
 `CHARACTER` type of Fortran 2018.
 The default (`CHARACTER(KIND=1)`) holds 8-bit character codes;
 `CHARACTER(KIND=2)` holds 16-bit character codes;
@@ -108,12 +108,12 @@ The result of `//` may be used
 * as the value of a specifier of an I/O statement,
 * or as the value of a statement function.
 
-The f18 compiler has a general (but slow) means of implementing concatenation
+The Flang compiler has a general (but slow) means of implementing concatenation
 and a specialized (fast) option to optimize the most common case.
 
 ### General concatenation
 
-In the most general case, the f18 compiler's generated code and
+In the most general case, Flang's generated code and
 runtime support library represent the result as a deferred-length allocatable
 `CHARACTER` temporary scalar or array variable that is initialized
 as a zero-length array by `AllocatableInitCharacter()`
diff --git a/flang/docs/DoConcurrent.md b/flang/docs/DoConcurrent.md
index bd1008a..eba2656 100644
--- a/flang/docs/DoConcurrent.md
+++ b/flang/docs/DoConcurrent.md
@@ -280,7 +280,8 @@ Specifically, an easy means is required that stipulates that localization
 should apply at most only to the obvious cases of local non-pointer
 non-allocatable scalars.
 
-In the LLVM Fortran compiler project (a/k/a "flang", "f18") we considered
+In the LLVM Fortran compiler project (now known as "flang", previously also
+known as "f18") we considered
 several solutions to this problem.
 1. Add syntax (e.g., `DO PARALLEL` or `DO CONCURRENT() DEFAULT(PARALLEL)`)
    by which one can inform the compiler that it should localize only
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 420b751..6d87209 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -84,7 +84,7 @@ end
   be "local identifiers" and should be distinct in the "inclusive
   scope" -- i.e., not scoped by `BLOCK` constructs.
   As most (but not all) compilers implement `BLOCK` scoping of construct
-  names, so does f18, with a portability warning.
+  names, so does Flang, with a portability warning.
 * 15.6.4 paragraph 2 prohibits an implicitly typed statement function
   from sharing the same name as a symbol in its scope's host, if it
   has one.
@@ -153,7 +153,7 @@ end
   that a call to intrinsic module procedure `ieee_support_halting` with
   a constant argument has a compile time constant result in `constant
   expression` and `specification expression` contexts. In compilations
-  where this information is not known at compile time, f18 generates code
+  where this information is not known at compile time, Flang generates code
   to determine the absence or presence of this capability at runtime.
   A call to `ieee_support_halting` in contexts that the standard requires
   to be constant will generate a compilation error. `ieee_support_standard`
@@ -366,7 +366,7 @@ end
 * The legacy extension intrinsic functions `IZEXT` and `JZEXT`
   are supported; `ZEXT` has different behavior with various older
   compilers, so it is not supported.
-* f18 doesn't impose a limit on the number of continuation lines
+* Flang doesn't impose a limit on the number of continuation lines
   allowed for a single statement.
 * When a type-bound procedure declaration statement has neither interface
   nor attributes, the "::" before the bindings is optional, even
@@ -553,7 +553,7 @@ end
 * Fortran explicitly ignores type declaration statements when they
   attempt to type the name of a generic intrinsic function (8.2 p3).
   One can declare `CHARACTER::COS` and still get a real result
-  from `COS(3.14159)`, for example.  f18 will complain when a
+  from `COS(3.14159)`, for example.  Flang will complain when a
   generic intrinsic function's inferred result type does not
   match an explicit declaration.  This message is a warning.
 
@@ -570,7 +570,7 @@ end
 
 ## Standard features that might as well not be
 
-* f18 supports designators with constant expressions, properly
+* Flang supports designators with constant expressions, properly
   constrained, as initial data targets for data pointers in
   initializers of variable and component declarations and in
   `DATA` statements; e.g., `REAL, POINTER :: P => T(1:10:2)`.
@@ -587,8 +587,8 @@ end
 * The standard doesn't explicitly require that a named constant that
   appears as part of a complex-literal-constant be a scalar, but
   most compilers emit an error when an array appears.
-  f18 supports them with a portability warning.
-* f18 does not enforce a blanket prohibition against generic
+  Flang supports them with a portability warning.
+* Flang does not enforce a blanket prohibition against generic
   interfaces containing a mixture of functions and subroutines.
   We allow both to appear, unlike several other Fortran compilers.
   This is especially desirable when two generics of the same
@@ -655,7 +655,7 @@ end
   treat them as references to implicitly typed local variables, and
   load uninitialized values.
 
-  In f18, we chose to emit an error message for this case since the standard
+  In Flang, we chose to emit an error message for this case since the standard
   is unclear, the usage is not portable, and the issue can be easily resolved
   by adding a declaration.
 
@@ -686,7 +686,7 @@ end
 
 * When a `DATA` statement in a `BLOCK` construct could be construed as
   either initializing a host-associated object or declaring a new local
-  initialized object, f18 interprets the standard's classification of
+  initialized object, Flang interprets the standard's classification of
   a `DATA` statement as being a "declaration" rather than a "specification"
   construct, and notes that the `BLOCK` construct is defined as localizing
   names that have specifications in the `BLOCK` construct.
@@ -703,7 +703,7 @@ end subroutine
   Other Fortran compilers disagree with each other in their interpretations
   of this example.
   The precedent among the most commonly used compilers
-  agrees with f18's interpretation: a `DATA` statement without any other
+  agrees with Flang's interpretation: a `DATA` statement without any other
   specification of the name refers to the host-associated object.
 
 * Many Fortran compilers allow a non-generic procedure to be `USE`-associated
@@ -729,7 +729,7 @@ module m2
 end module
 ```
 
-  This case elicits a warning from f18, as it should not be treated
+  This case elicits a warning from Flang, as it should not be treated
   any differently than the same case with the non-generic procedure of
   the same name being defined in the same scope rather than being
   `USE`-associated into it, which is explicitly non-conforming in the
@@ -747,7 +747,7 @@ end module
   symbols, much less appear in specification inquiries, and there are
   application codes that expect exterior symbols whose names match
   components to be visible in a derived-type definition's default initialization
-  expressions, and so f18 follows that precedent.
+  expressions, and so Flang follows that precedent.
 
 * 19.3.1p1 "Within its scope, a local identifier of an entity of class (1)
   or class (4) shall not be the same as a global identifier used in that scope..."
@@ -769,17 +769,17 @@ end module
   left-hand side for a pointer assignment statement, and we emit a
   portability warning when it is not.
 
-* F18 allows a `USE` statement to reference a module that is defined later
+* Flang allows a `USE` statement to reference a module that is defined later
   in the same compilation unit, so long as mutual dependencies do not form
   a cycle.
   This feature forestalls any risk of such a `USE` statement reading an
   obsolete module file from a previous compilation and then overwriting
   that file later.
 
-* F18 allows `OPTIONAL` dummy arguments to interoperable procedures
+* Flang allows `OPTIONAL` dummy arguments to interoperable procedures
   unless they are `VALUE` (C865).
 
-* F18 processes the `NAMELIST` group declarations in a scope after it
+* Flang processes the `NAMELIST` group declarations in a scope after it
   has resolved all of the names in that scope.  This means that names
   that appear before their local declarations do not resolve to host
   associated objects and do not elicit errors about improper redeclarations
@@ -862,11 +862,11 @@ print *, [(j,j=1,10)]
 
 * The Fortran standard doesn't mention integer overflow explicitly. In many cases,
   however, integer overflow makes programs non-conforming.
-  F18 follows other widely-used Fortran compilers. Specifically, f18 assumes
+  Flang follows other widely-used Fortran compilers. Specifically, Flang assumes
   integer overflow never occurs in address calculations and increment of
   do-variable unless the option `-fwrapv` is enabled.
 
-* Two new ieee_round_type values were added in f18 beyond the four values
+* Two new ieee_round_type values were added in Flang beyond the four values
   defined in f03 and f08: ieee_away and ieee_other. Contemporary hardware
   typically does not have support for these rounding modes;
   ieee_support_rounding calls for these values return false.
diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md
index 135e6b7..9023fdc 100644
--- a/flang/docs/FortranForCProgrammers.md
+++ b/flang/docs/FortranForCProgrammers.md
@@ -304,7 +304,7 @@ Preprocessing behavior varies across implementations and one should not depend o
 much portability.
 Preprocessing is typically requested by the use of a capitalized filename
 suffix (e.g., "foo.F90") or a compiler command line option.
-(Since the F18 compiler always runs its built-in preprocessing stage,
+(Since Flang always runs its built-in preprocessing stage,
 no special option or filename suffix is required.)
 
 ## "Object Oriented" Programming
diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md
index f9f8f64..7f3c7b2 100644
--- a/flang/docs/FortranIR.md
+++ b/flang/docs/FortranIR.md
@@ -171,7 +171,7 @@ FIR is intentionally similar to SIL from the statement level up to the level of
 Program, procedure, region, and basic block all leverage code from LLVM, in much the same way as SIL. These data structures have significant investment and engineering behind their use in compilers, and it makes sense to leverage that work.
 
 * Pro: Uses LLVM data structures, pervasive in compiler projects such as LLVM, SIL, etc.
-* Pro: Get used to seeing and using LLVM, as f18 aims to be an LLVM project
+* Pro: Get used to seeing and using LLVM, as Flang aims to be an LLVM project
 * Con: Uses LLVM data structures, which the project has been avoiding
 
 #### Alternative: C++ Standard Template Library
diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md
index 79af788..2d28342 100644
--- a/flang/docs/GettingInvolved.md
+++ b/flang/docs/GettingInvolved.md
@@ -41,7 +41,7 @@ Contributions to Flang are done using GitHub Pull Requests and follow the
 ### Flang Slack Workspace
 
 -   There is a Slack workspace dedicated to Flang.
--   There are a number of topic-oriented channels available (e.g., #driver, #f18-semantics, #fir).
+-   There are a number of topic-oriented channels available (e.g., #driver, #fir).
 -   Add yourself via the *[invitation link](https://join.slack.com/t/flang-compiler/shared_invite/zt-2pcn51lh-VrRQL_YUOkxA_1CEfMGQhw "title")*
 
 ## Calls
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index 34b6559..bfda5f3 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -19,7 +19,7 @@ of functions or subroutines with similar interfaces as an aid to
 comprehension beyond that which might be gained from the standard's
 alphabetical list.
 
-A brief status of intrinsic procedure support in f18 is also given at the end.
+A brief status of intrinsic procedure support in Flang is also given at the end.
 
 Few procedures are actually described here apart from their interfaces; see the
 Fortran 2018 standard (section 16) for the complete story.
@@ -733,20 +733,20 @@ In case the invocation would be an error if the procedure were the intrinsic
 leaves two choices to the compiler: emit an error about the intrinsic invocation,
 or consider this is an external procedure and emit no error.
 
-f18 will always consider this case to be the intrinsic and emit errors, unless the procedure
+Flang will always consider this case to be the intrinsic and emit errors, unless the procedure
 is used as a function (resp. subroutine) and the intrinsic is a subroutine (resp. function).
 The table below gives some examples of decisions made by Fortran compilers in such case.
 
 | What is ACOS ?     | Bad intrinsic call       | External with warning |  External no warning | Other error |
 | --- | --- | --- | --- | --- |
-| `print*, ACOS()`     | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `print*, ACOS(I)`    | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `print*, ACOS(X=I)`  | gfortran, nag, xlf, f18  |  ifort                |                      | nvfortran (keyword on implicit extrenal )|
-| `print*, ACOS(X, X)` | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `CALL ACOS(X)`       |                          |                       |  gfortran, nag, xlf, nvfortran, ifort, f18  | |
+| `print*, ACOS()`     | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `print*, ACOS(I)`    | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `print*, ACOS(X=I)`  | gfortran, nag, xlf, flang  |  ifort                |                      | nvfortran (keyword on implicit external)|
+| `print*, ACOS(X, X)` | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `CALL ACOS(X)`       |                          |                       |  gfortran, nag, xlf, nvfortran, ifort, flang  | |
 
 
-The rationale for f18 behavior is that when referring to a procedure with an
+The rationale for Flang's behavior is that when referring to a procedure with an
 argument number or type that does not match the intrinsic specification, it seems safer to block
 the rather likely case where the user is using the intrinsic the wrong way.
 In case the user wanted to refer to an external function, he can add an explicit EXTERNAL
@@ -759,13 +759,13 @@ Also note that in general, the standard gives the compiler the right to consider
 any procedure that is not explicitly external as a non standard intrinsic (section 4.2 point 4).
 So it is highly advised for the programmer to use EXTERNAL statements to prevent any ambiguity.
 
-## Intrinsic Procedure Support in f18
-This section gives an overview of the support inside f18 libraries for the
+## Intrinsic Procedure Support in Flang
+This section gives an overview of the support inside Flang libraries for the
 intrinsic procedures listed above.
-It may be outdated, refer to f18 code base for the actual support status.
+It may be outdated; refer to the Flang code base for the actual support status.
 
 ### Semantic Analysis
-F18 semantic expression analysis phase detects intrinsic procedure references,
+Flang semantic expression analysis phase detects intrinsic procedure references,
 validates the argument types and deduces the return types.
 This phase currently supports all the intrinsic procedures listed above but the ones in the table below.
 
@@ -789,17 +789,17 @@ Constant Expressions may be used to define kind arguments. Therefore, the semant
 expression analysis phase must be able to fold references to intrinsic functions
 listed in section 10.1.12.
 
-F18 intrinsic function folding is either performed by implementations directly
-operating on f18 scalar types or by using host runtime functions and
-host hardware types. F18 supports folding elemental intrinsic functions over
+Flang intrinsic function folding is either performed by implementations directly
+operating on Flang scalar types or by using host runtime functions and
+host hardware types. Flang supports folding elemental intrinsic functions over
 arrays when an implementation is provided for the scalars (regardless of whether
 it is using host hardware types or not).
 The status of intrinsic function folding support is given in the sub-sections below.
 
 #### Intrinsic Functions with Host Independent Folding Support
-Implementations using f18 scalar types enables folding intrinsic functions
-on any host and with any possible type kind supported by f18. The intrinsic functions
-listed below are folded using host independent implementations.
+Implementations using Flang scalar types enable folding intrinsic functions
+on any host and with any possible type kind supported by Flang. The intrinsic
+functions listed below are folded using host independent implementations.
 
 | Return Type | Intrinsic Functions with Host Independent Folding Support|
 | --- | --- |
@@ -810,12 +810,12 @@ listed below are folded using host independent implementations.
 
 #### Intrinsic Functions with Host Dependent Folding Support
 Implementations using the host runtime may not be available for all supported
-f18 types depending on the host hardware types and the libraries available on the host.
+Flang types depending on the hardware type of the host and the libraries available on it.
 The actual support on a host depends on what the host hardware types are.
 The list below gives the functions that are folded using host runtime and the related C/C++ types.
-F18 automatically detects if these types match an f18 scalar type. If so,
-folding of the intrinsic functions will be possible for the related f18 scalar type,
-otherwise an error message will be produced by f18 when attempting to fold related intrinsic functions.
+Flang automatically detects if these types match a Flang scalar type. If so,
+folding of the intrinsic functions will be possible for the related Flang scalar type,
+otherwise an error message will be produced by Flang when attempting to fold related intrinsic functions.
 
 | C/C++ Host Type | Intrinsic Functions with Host Standard C++ Library Based Folding Support |
 | --- | --- |
@@ -823,17 +823,17 @@ otherwise an error message will be produced by f18 when attempting to fold relat
 | std::complex for float, double and long double| ACOS, ACOSH, ASIN, ASINH, ATAN, ATANH, COS, COSH, EXP, LOG, SIN, SINH, SQRT, TAN, TANH |
 
 On top of the default usage of C++ standard library functions for folding described
-in the table above, it is possible to compile f18 evaluate library with
+in the table above, it is possible to compile Flang's evaluate library with
 [libpgmath](https://github.com/flang-compiler/flang/tree/master/runtime/libpgmath)
 so that it can be used for folding. To do so, one must have a compiled version
 of the libpgmath library available on the host and add
-`-DLIBPGMATH_DIR=<path to the compiled shared libpgmath library>` to the f18 cmake command.
+`-DLIBPGMATH_DIR=<path to the compiled shared libpgmath library>` to the Flang cmake command.
 
 Libpgmath comes with real and complex functions that replace C++ standard library
 float and double functions to fold all the intrinsic functions listed in the table above.
-It has no long double versions. If the host long double matches an f18 scalar type,
+It has no long double versions. If the host long double matches a Flang scalar type,
 C++ standard library functions will still be used for folding expressions with this scalar type.
-Libpgmath adds the possibility to fold the following functions for f18 real scalar
+Libpgmath adds the possibility to fold the following functions for Flang's real scalar
 types related to host float and double types.
 
 | C/C++ Host Type | Additional Intrinsic Function Folding Support with Libpgmath (Optional) |
@@ -841,10 +841,10 @@ types related to host float and double types.
 |float and double| BESSEL_J0, BESSEL_J1, BESSEL_JN (elemental only), BESSEL_Y0, BESSEL_Y1, BESSEL_Yn (elemental only), DERFC_SCALED, ERFC_SCALED, QERFC_SCALED |
 
 Libpgmath comes in three variants (precise, relaxed and fast). So far, only the
-precise version is used for intrinsic function folding in f18. It guarantees the greatest numerical precision.
+precise version is used for intrinsic function folding in Flang. It guarantees the greatest numerical precision.
 
 ### Intrinsic Functions with Missing Folding Support
-The following intrinsic functions are allowed in constant expressions but f18
+The following intrinsic functions are allowed in constant expressions but Flang
 is not yet able to fold them. Note that there might be constraints on the arguments
 so that these intrinsics can be used in constant expressions (see section 10.1.12 of Fortran 2018 standard).
 
@@ -1133,8 +1133,8 @@ end program rename_proc
 - **Standard:**  GNU extension
 - **Class:**     function
 - **Syntax:**    result = `SECNDS(refTime)`
-- **Arguments:** 
- 
+- **Arguments:**
+
 | ARGUMENT  | INTENT |      TYPE     |          KIND           |           Description                    |
 |-----------|--------|---------------|-------------------------|------------------------------------------|
 | `refTime` | `IN`   | `REAL, scalar`| REAL(KIND=4), required  | Reference time in seconds since midnight |
@@ -1157,16 +1157,16 @@ END PROGRAM example_secnds
 since midnight minus a user-supplied reference time `refTime`. Uses `REAL(KIND=8)` for higher precision.
 
 #### Usage and Info
-- **Standard:** PGI extension  
-- **Class:**     function  
-- **Syntax:**    result = `DSECNDS(refTime)`  
-- **Arguments:** 
+- **Standard:** PGI extension
+- **Class:**     function
+- **Syntax:**    result = `DSECNDS(refTime)`
+- **Arguments:**
 
 | ARGUMENT  | INTENT |      TYPE     |          KIND           |           Description                    |
 |-----------|--------|---------------|-------------------------|------------------------------------------|
 | `refTime` | `IN`   | `REAL, scalar`| REAL(KIND=8), required  | Reference time in seconds since midnight |
 
-- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`.  
+- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`.
 - **Purity:** Impure
 
 #### Example
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index f96d222..9f7c10c 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -98,6 +98,13 @@ public:
   mlir::Type getElementOrSequenceType() const {
     return hlfir::getFortranElementOrSequenceType(getType());
   }
+  /// Return the fir.class or fir.box type needed to describe this entity.
+  fir::BaseBoxType getBoxType() const {
+    if (isBoxAddressOrValue())
+      return llvm::cast<fir::BaseBoxType>(fir::unwrapRefType(getType()));
+    const bool isVolatile = fir::isa_volatile_type(getType());
+    return fir::BoxType::get(getElementOrSequenceType(), isVolatile);
+  }
 
   bool hasLengthParameters() const {
     mlir::Type eleTy = getFortranElementType();
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index fc6eedc..86502c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -1249,6 +1249,41 @@ def fir_IsAssumedSizeOp : fir_SimpleOp<"is_assumed_size", [NoMemoryEffect]> {
   let results = (outs BoolLike);
 }
 
+def fir_AssumedSizeExtentOp : fir_SimpleOneResultOp<"assumed_size_extent", [NoMemoryEffect]> {
+  let summary = "get the assumed-size last extent sentinel";
+
+  let description = [{
+    Returns the special extent value representing the last dimension of an
+    assumed-size array. This is used to model the semantics in FIR without
+    directly materializing the sentinel value. The concrete encoding is
+    introduced during FIR to LLVM lowering.
+
+    ```
+      %e = fir.assumed_size_extent : index
+    ```
+  }];
+
+  let results = (outs Index);
+  let assemblyFormat = "attr-dict `:` type(results)";
+}
+
+def fir_IsAssumedSizeExtentOp : fir_SimpleOp<"is_assumed_size_extent", [NoMemoryEffect]> {
+  let summary = "is value the assumed-size last extent sentinel";
+
+  let description = [{
+    Returns true iff the given integer equals the assumed-size extent sentinel.
+
+    ```
+      %t = fir.is_assumed_size_extent %v : (index) -> i1
+      %c = fir.is_assumed_size_extent %x : (i32) -> i1
+    ```
+  }];
+
+  let arguments = (ins AnyIntegerLike:$val);
+  let results = (outs BoolLike);
+  let hasCanonicalizer = 1;
+}
+
 def fir_BoxIsPtrOp : fir_SimpleOp<"box_isptr", [NoMemoryEffect]> {
   let summary = "is the boxed value a POINTER?";
 
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 682dd82..70b9341 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -22,6 +22,7 @@
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index 0f85183..7539d12 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -13,9 +13,11 @@
 #ifndef FORTRAN_SEMANTICS_OPENMP_UTILS_H
 #define FORTRAN_SEMANTICS_OPENMP_UTILS_H
 
+#include "flang/Common/indirection.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/char-block.h"
 #include "flang/Parser/parse-tree.h"
+#include "flang/Parser/tools.h"
 #include "flang/Semantics/tools.h"
 
 #include "llvm/ADT/ArrayRef.h"
@@ -74,7 +76,11 @@ bool IsVarOrFunctionRef(const MaybeExpr &expr);
 bool IsMapEnteringType(parser::OmpMapType::Value type);
 bool IsMapExitingType(parser::OmpMapType::Value type);
 
-std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr);
+MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr);
+template <typename T> MaybeExpr GetEvaluateExpr(const T &inp) {
+  return GetEvaluateExpr(parser::UnwrapRef<parser::Expr>(inp));
+}
+
 std::optional<evaluate::DynamicType> GetDynamicType(
     const parser::Expr &parserExpr);
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 3b711cc..a516a44 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1766,7 +1766,7 @@ private:
       // to a crash due to a block with no terminator. See issue #126452.
       mlir::FunctionType funcType = builder->getFunction().getFunctionType();
       mlir::Type resultType = funcType.getResult(0);
-      mlir::Value undefResult = builder->create<fir::UndefOp>(loc, resultType);
+      mlir::Value undefResult = fir::UndefOp::create(*builder, loc, resultType);
       genExitRoutine(false, undefResult);
       return;
     }
@@ -4010,8 +4010,8 @@ private:
       // parameters and dynamic type. The selector cannot be a
       // POINTER/ALLOCATBLE as per F'2023 C1160.
       fir::ExtendedValue newExv;
-      llvm::SmallVector assumeSizeExtents{
-          builder->createMinusOneInteger(loc, builder->getIndexType())};
+      llvm::SmallVector<mlir::Value> assumeSizeExtents{
+          fir::AssumedSizeExtentOp::create(*builder, loc)};
       mlir::Value baseAddr =
           hlfir::genVariableRawAddress(loc, *builder, selector);
       const bool isVolatile = fir::isa_volatile_type(selector.getType());
@@ -4733,11 +4733,21 @@ private:
       return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {});
     hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR(
         loc, *this, assign.rhs, localSymbols, rhsContext);
+    auto rhsBoxType = rhs.getBoxType();
     // Create pointer descriptor value from the RHS.
     if (rhs.isMutableBox())
       rhs = hlfir::Entity{fir::LoadOp::create(*builder, loc, rhs)};
-    mlir::Value rhsBox = hlfir::genVariableBox(
-        loc, *builder, rhs, lhsBoxType.getBoxTypeWithNewShape(rhs.getRank()));
+
+    // Use LHS type if LHS is not polymorphic.
+    fir::BaseBoxType targetBoxType;
+    if (assign.lhs.GetType()->IsPolymorphic())
+      targetBoxType = rhsBoxType.getBoxTypeWithNewAttr(
+          fir::BaseBoxType::Attribute::Pointer);
+    else
+      targetBoxType = lhsBoxType.getBoxTypeWithNewShape(rhs.getRank());
+    mlir::Value rhsBox =
+        hlfir::genVariableBox(loc, *builder, rhs, targetBoxType);
+
     // Apply lower bounds or reshaping if any.
     if (const auto *lbExprs =
             std::get_if<Fortran::evaluate::Assignment::BoundsSpec>(&assign.u);
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 00ec1b5..2517ab3 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -1711,7 +1711,7 @@ static void lowerExplicitLowerBounds(
 /// CFI_desc_t requirements in 18.5.3 point 5.).
 static mlir::Value getAssumedSizeExtent(mlir::Location loc,
                                         fir::FirOpBuilder &builder) {
-  return builder.createMinusOneInteger(loc, builder.getIndexType());
+  return fir::AssumedSizeExtentOp::create(builder, loc);
 }
 
 /// Lower explicit extents into \p result if this is an explicit-shape or
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index af4f420..1fc59c7 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -2366,6 +2366,23 @@ static void processDoLoopBounds(
   }
 }
 
+static void remapCommonBlockMember(
+    Fortran::lower::AbstractConverter &converter, mlir::Location loc,
+    const Fortran::semantics::Symbol &member,
+    mlir::Value newCommonBlockBaseAddress,
+    const Fortran::semantics::Symbol &commonBlockSymbol,
+    llvm::SmallPtrSetImpl<const Fortran::semantics::Symbol *> &seenSymbols) {
+  if (seenSymbols.contains(&member))
+    return;
+  mlir::Value accMemberValue = Fortran::lower::genCommonBlockMember(
+      converter, loc, member, newCommonBlockBaseAddress,
+      commonBlockSymbol.size());
+  fir::ExtendedValue hostExv = converter.getSymbolExtendedValue(member);
+  fir::ExtendedValue accExv = fir::substBase(hostExv, accMemberValue);
+  converter.bindSymbol(member, accExv);
+  seenSymbols.insert(&member);
+}
+
 /// Remap symbols that appeared in OpenACC data clauses to use the results of
 /// the corresponding data operations. This allows isolating symbol accesses
 /// inside the OpenACC region from accesses in the host and other regions while
@@ -2391,14 +2408,39 @@ static void remapDataOperandSymbols(
   builder.setInsertionPointToStart(&regionOp.getRegion().front());
   llvm::SmallPtrSet<const Fortran::semantics::Symbol *, 8> seenSymbols;
   mlir::IRMapping mapper;
+  mlir::Location loc = regionOp.getLoc();
   for (auto [value, symbol] : dataOperandSymbolPairs) {
-
-    // If A symbol appears on several data clause, just map it to the first
+    // If a symbol appears on several data clauses, just map it to the first
     // result (all data operations results for a symbol are pointing same
     // memory, so it does not matter which one is used).
     if (seenSymbols.contains(&symbol.get()))
       continue;
     seenSymbols.insert(&symbol.get());
+    // When a common block appears in a directive, remap its members.
+    // Note: this will instantiate all common block members even if they are not
+    // used inside the region. If hlfir.declare DCE is not made possible, this
+    // could be improved to reduce IR noise.
+    if (const auto *commonBlock = symbol->template detailsIf<
+                                  Fortran::semantics::CommonBlockDetails>()) {
+      const Fortran::semantics::Scope &commonScope = symbol->owner();
+      if (commonScope.equivalenceSets().empty()) {
+        for (auto member : commonBlock->objects())
+          remapCommonBlockMember(converter, loc, *member, value, *symbol,
+                                 seenSymbols);
+      } else {
+        // Objects equivalenced with common block members still belong to the
+        // common block storage even if they are not part of the common block
+        // declaration. The easiest and most robust way to find all symbols
+        // belonging to the common block is to loop through the scope symbols
+        // and check if they belong to the common.
+        for (const auto &scopeSymbol : commonScope)
+          if (Fortran::semantics::FindCommonBlockContaining(
+                  *scopeSymbol.second) == &symbol.get())
+            remapCommonBlockMember(converter, loc, *scopeSymbol.second, value,
+                                   *symbol, seenSymbols);
+      }
+      continue;
+    }
     std::optional<fir::FortranVariableOpInterface> hostDef =
         symbolMap.lookupVariableDefinition(symbol);
     assert(hostDef.has_value() && llvm::isa<hlfir::DeclareOp>(*hostDef) &&
@@ -2415,10 +2457,8 @@ static void remapDataOperandSymbols(
              "box type mismatch between compute region variable and "
              "hlfir.declare input unexpected");
       if (Fortran::semantics::IsOptional(symbol))
-        TODO(regionOp.getLoc(),
-             "remapping OPTIONAL symbol in OpenACC compute region");
-      auto rawValue =
-          fir::BoxAddrOp::create(builder, regionOp.getLoc(), hostType, value);
+        TODO(loc, "remapping OPTIONAL symbol in OpenACC compute region");
+      auto rawValue = fir::BoxAddrOp::create(builder, loc, hostType, value);
       mapper.map(hostInput, rawValue);
     } else {
       assert(!llvm::isa<fir::BaseBoxType>(hostType) &&
@@ -2430,8 +2470,7 @@ static void remapDataOperandSymbols(
       assert(fir::isa_ref_type(hostType) && fir::isa_ref_type(computeType) &&
              "compute region variable and host variable should both be raw "
              "addresses");
-      mlir::Value cast =
-          builder.createConvert(regionOp.getLoc(), hostType, value);
+      mlir::Value cast = builder.createConvert(loc, hostType, value);
       mapper.map(hostInput, cast);
     }
     if (mlir::Value dummyScope = hostDeclare.getDummyScope()) {
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index a49961c..7106728 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2059,37 +2059,38 @@ static void genCanonicalLoopNest(
     // Start lowering
     mlir::Value zero = firOpBuilder.createIntegerConstant(loc, loopVarType, 0);
     mlir::Value one = firOpBuilder.createIntegerConstant(loc, loopVarType, 1);
-    mlir::Value isDownwards = firOpBuilder.create<mlir::arith::CmpIOp>(
-        loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero);
+    mlir::Value isDownwards = mlir::arith::CmpIOp::create(
+        firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero);
 
     // Ensure we are counting upwards. If not, negate step and swap lb and ub.
     mlir::Value negStep =
-        firOpBuilder.create<mlir::arith::SubIOp>(loc, zero, loopStepVar);
-    mlir::Value incr = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, negStep, loopStepVar);
-    mlir::Value lb = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, loopUBVar, loopLBVar);
-    mlir::Value ub = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, loopLBVar, loopUBVar);
+        mlir::arith::SubIOp::create(firOpBuilder, loc, zero, loopStepVar);
+    mlir::Value incr = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, negStep, loopStepVar);
+    mlir::Value lb = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, loopUBVar, loopLBVar);
+    mlir::Value ub = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, loopLBVar, loopUBVar);
 
     // Compute the trip count assuming lb <= ub. This guarantees that the result
     // is non-negative and we can use unsigned arithmetic.
-    mlir::Value span = firOpBuilder.create<mlir::arith::SubIOp>(
-        loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw);
+    mlir::Value span = mlir::arith::SubIOp::create(
+        firOpBuilder, loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw);
     mlir::Value tcMinusOne =
-        firOpBuilder.create<mlir::arith::DivUIOp>(loc, span, incr);
-    mlir::Value tcIfLooping = firOpBuilder.create<mlir::arith::AddIOp>(
-        loc, tcMinusOne, one, ::mlir::arith::IntegerOverflowFlags::nuw);
+        mlir::arith::DivUIOp::create(firOpBuilder, loc, span, incr);
+    mlir::Value tcIfLooping =
+        mlir::arith::AddIOp::create(firOpBuilder, loc, tcMinusOne, one,
+                                    ::mlir::arith::IntegerOverflowFlags::nuw);
 
     // Fall back to 0 if lb > ub
-    mlir::Value isZeroTC = firOpBuilder.create<mlir::arith::CmpIOp>(
-        loc, mlir::arith::CmpIPredicate::slt, ub, lb);
-    mlir::Value tripcount = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isZeroTC, zero, tcIfLooping);
+    mlir::Value isZeroTC = mlir::arith::CmpIOp::create(
+        firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, ub, lb);
+    mlir::Value tripcount = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isZeroTC, zero, tcIfLooping);
     tripcounts.push_back(tripcount);
 
     // Create the CLI handle.
-    auto newcli = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto newcli = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     mlir::Value cli = newcli.getResult();
     clis.push_back(cli);
 
@@ -2122,10 +2123,10 @@ static void genCanonicalLoopNest(
                "Expecting all block args to have been collected by now");
         for (auto j : llvm::seq<size_t>(numLoops)) {
           mlir::Value natIterNum = fir::getBase(blockArgs[j]);
-          mlir::Value scaled = firOpBuilder.create<mlir::arith::MulIOp>(
-              loc, natIterNum, loopStepVars[j]);
-          mlir::Value userVal = firOpBuilder.create<mlir::arith::AddIOp>(
-              loc, loopLBVars[j], scaled);
+          mlir::Value scaled = mlir::arith::MulIOp::create(
+              firOpBuilder, loc, natIterNum, loopStepVars[j]);
+          mlir::Value userVal = mlir::arith::AddIOp::create(
+              firOpBuilder, loc, loopLBVars[j], scaled);
 
           mlir::OpBuilder::InsertPoint insPt =
               firOpBuilder.saveInsertionPoint();
@@ -2198,9 +2199,9 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
   gridGeneratees.reserve(numLoops);
   intratileGeneratees.reserve(numLoops);
   for ([[maybe_unused]] auto i : llvm::seq<int>(0, sizesClause.sizes.size())) {
-    auto gridCLI = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto gridCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     gridGeneratees.push_back(gridCLI.getResult());
-    auto intratileCLI = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto intratileCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     intratileGeneratees.push_back(intratileCLI.getResult());
   }
 
@@ -2209,8 +2210,8 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
   generatees.append(gridGeneratees);
   generatees.append(intratileGeneratees);
 
-  firOpBuilder.create<mlir::omp::TileOp>(loc, generatees, applyees,
-                                         sizesClause.sizes);
+  mlir::omp::TileOp::create(firOpBuilder, loc, generatees, applyees,
+                            sizesClause.sizes);
 }
 
 static void genUnrollOp(Fortran::lower::AbstractConverter &converter,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 70bb43a2..478ab15 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -749,6 +749,44 @@ struct VolatileCastOpConversion
   }
 };
 
+/// Lower `fir.assumed_size_extent` to constant -1 of index type.
+struct AssumedSizeExtentOpConversion
+    : public fir::FIROpConversion<fir::AssumedSizeExtentOp> {
+  using FIROpConversion::FIROpConversion;
+
+  llvm::LogicalResult
+  matchAndRewrite(fir::AssumedSizeExtentOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    mlir::Type ity = lowerTy().indexType();
+    auto cst = fir::genConstantIndex(loc, ity, rewriter, -1);
+    rewriter.replaceOp(op, cst.getResult());
+    return mlir::success();
+  }
+};
+
+/// Lower `fir.is_assumed_size_extent` to integer equality with -1.
+struct IsAssumedSizeExtentOpConversion
+    : public fir::FIROpConversion<fir::IsAssumedSizeExtentOp> {
+  using FIROpConversion::FIROpConversion;
+
+  llvm::LogicalResult
+  matchAndRewrite(fir::IsAssumedSizeExtentOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    mlir::Value val = adaptor.getVal();
+    mlir::Type valTy = val.getType();
+    // Create constant -1 of the operand type.
+    auto negOneAttr = rewriter.getIntegerAttr(valTy, -1);
+    auto negOne =
+        mlir::LLVM::ConstantOp::create(rewriter, loc, valTy, negOneAttr);
+    auto cmp = mlir::LLVM::ICmpOp::create(
+        rewriter, loc, mlir::LLVM::ICmpPredicate::eq, val, negOne);
+    rewriter.replaceOp(op, cmp.getResult());
+    return mlir::success();
+  }
+};
+
 /// convert value of from-type to value of to-type
 struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
   using FIROpConversion::FIROpConversion;
@@ -1113,7 +1151,7 @@ struct AllocMemOpConversion : public fir::FIROpConversion<fir::AllocMemOp> {
     mlir::Value size = genTypeSizeInBytes(loc, ity, rewriter, llvmObjectTy);
     if (auto scaleSize =
             fir::genAllocationScaleSize(loc, heap.getInType(), ity, rewriter))
-      size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+      size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize);
     for (mlir::Value opnd : adaptor.getOperands())
       size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size,
                                        integerCast(loc, rewriter, ity, opnd));
@@ -4360,6 +4398,7 @@ void fir::populateFIRToLLVMConversionPatterns(
       AllocaOpConversion, AllocMemOpConversion, BoxAddrOpConversion,
       BoxCharLenOpConversion, BoxDimsOpConversion, BoxEleSizeOpConversion,
       BoxIsAllocOpConversion, BoxIsArrayOpConversion, BoxIsPtrOpConversion,
+      AssumedSizeExtentOpConversion, IsAssumedSizeExtentOpConversion,
       BoxOffsetOpConversion, BoxProcHostOpConversion, BoxRankOpConversion,
       BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion,
       CmpcOpConversion, VolatileCastOpConversion, ConvertOpConversion,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
index 381b2a2..f74d635 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
@@ -242,10 +242,11 @@ struct TargetAllocMemOpConversion
         loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout());
     if (auto scaleSize = fir::genAllocationScaleSize(
             loc, allocmemOp.getInType(), ity, rewriter))
-      size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+      size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize);
     for (mlir::Value opnd : adaptor.getOperands().drop_front())
-      size = rewriter.create<mlir::LLVM::MulOp>(
-          loc, ity, size, integerCast(lowerTy(), loc, rewriter, ity, opnd));
+      size = mlir::LLVM::MulOp::create(
+          rewriter, loc, ity, size,
+          integerCast(lowerTy(), loc, rewriter, ity, opnd));
     auto mallocTyWidth = lowerTy().getIndexTypeBitwidth();
     auto mallocTy =
         mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth);
diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
index ac285b5..0776346 100644
--- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
@@ -872,6 +872,14 @@ public:
       }
     }
 
+    // Count the number of arguments that have to stay in place at the end of
+    // the argument list.
+    unsigned trailingArgs = 0;
+    if constexpr (std::is_same_v<FuncOpTy, mlir::gpu::GPUFuncOp>) {
+      trailingArgs =
+          func.getNumWorkgroupAttributions() + func.getNumPrivateAttributions();
+    }
+
     // Convert return value(s)
     for (auto ty : funcTy.getResults())
       llvm::TypeSwitch<mlir::Type>(ty)
@@ -981,6 +989,16 @@ public:
       }
     }
 
+    // Add the argument at the end if the number of trailing arguments is 0,
+    // otherwise insert the argument at the appropriate index.
+    auto addOrInsertArgument = [&](mlir::Type ty, mlir::Location loc) {
+      unsigned inputIndex = func.front().getArguments().size() - trailingArgs;
+      auto newArg = trailingArgs == 0
+                        ? func.front().addArgument(ty, loc)
+                        : func.front().insertArgument(inputIndex, ty, loc);
+      return newArg;
+    };
+
     if (!func.empty()) {
       // If the function has a body, then apply the fixups to the arguments and
       // return ops as required. These fixups are done in place.
@@ -1117,8 +1135,7 @@ public:
           // original arguments. (Boxchar arguments.)
           auto newBufArg =
               func.front().insertArgument(fixup.index, fixupType, loc);
-          auto newLenArg =
-              func.front().addArgument(trailingTys[fixup.second], loc);
+          auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc);
           auto boxTy = oldArgTys[fixup.index - offset];
           rewriter->setInsertionPointToStart(&func.front());
           auto box = fir::EmboxCharOp::create(*rewriter, loc, boxTy, newBufArg,
@@ -1133,8 +1150,7 @@ public:
           // appended after all the original arguments.
           auto newProcPointerArg =
               func.front().insertArgument(fixup.index, fixupType, loc);
-          auto newLenArg =
-              func.front().addArgument(trailingTys[fixup.second], loc);
+          auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc);
           auto tupleType = oldArgTys[fixup.index - offset];
           rewriter->setInsertionPointToStart(&func.front());
           fir::FirOpBuilder builder(*rewriter, getModule());
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 1712af1..d0164f3 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -5143,6 +5143,34 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns(
 }
 
 //===----------------------------------------------------------------------===//
+// IsAssumedSizeExtentOp and AssumedSizeExtentOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct FoldIsAssumedSizeExtentOnCtor
+    : public mlir::OpRewritePattern<fir::IsAssumedSizeExtentOp> {
+  using mlir::OpRewritePattern<fir::IsAssumedSizeExtentOp>::OpRewritePattern;
+  mlir::LogicalResult
+  matchAndRewrite(fir::IsAssumedSizeExtentOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(
+            op.getVal().getDefiningOp())) {
+      mlir::Type i1 = rewriter.getI1Type();
+      rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(
+          op, i1, rewriter.getIntegerAttr(i1, 1));
+      return mlir::success();
+    }
+    return mlir::failure();
+  }
+};
+} // namespace
+
+void fir::IsAssumedSizeExtentOp::getCanonicalizationPatterns(
+    mlir::RewritePatternSet &patterns, mlir::MLIRContext *context) {
+  patterns.add<FoldIsAssumedSizeExtentOnCtor>(context);
+}
+
+//===----------------------------------------------------------------------===//
 // LocalitySpecifierOp
 //===----------------------------------------------------------------------===//
 
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
index 4840a99..0d135a9 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
@@ -39,13 +39,13 @@ public:
 
   static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc,
                                mlir::Value value) {
-    return builder.create<fir::LoadOp>(loc, value);
+    return fir::LoadOp::create(builder, loc, value);
   }
 
   static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc,
                                    mlir::Value value) {
-    auto alloca = builder.create<fir::AllocaOp>(loc, value.getType());
-    builder.create<fir::StoreOp>(loc, value, alloca);
+    auto alloca = fir::AllocaOp::create(builder, loc, value.getType());
+    fir::StoreOp::create(builder, loc, value, alloca);
     return alloca;
   }
 };
diff --git a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
index 817434f..5793d46 100644
--- a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
+++ b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
@@ -130,8 +130,8 @@ class AutomapToTargetDataPass
           builder.getBoolAttr(false));
       clauses.mapVars.push_back(mapInfo);
       isa<fir::StoreOp>(memOp)
-          ? builder.create<omp::TargetEnterDataOp>(memOp.getLoc(), clauses)
-          : builder.create<omp::TargetExitDataOp>(memOp.getLoc(), clauses);
+          ? omp::TargetEnterDataOp::create(builder, memOp.getLoc(), clauses)
+          : omp::TargetExitDataOp::create(builder, memOp.getLoc(), clauses);
     };
 
     for (fir::GlobalOp globalOp : automapGlobals) {
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 65a23be..1229018 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -595,7 +595,7 @@ private:
               mlir::omp::TargetOperands &clauseOps,
               mlir::omp::LoopNestOperands &loopNestClauseOps,
               const LiveInShapeInfoMap &liveInShapeInfoMap) const {
-    auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
+    auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps);
     auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
 
     mlir::Region &region = targetOp.getRegion();
@@ -672,7 +672,7 @@ private:
     // temporary.
     Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
     rewriter.setInsertionPoint(
-        rewriter.create<mlir::omp::TerminatorOp>(targetOp.getLoc()));
+        mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc()));
 
     return targetOp;
   }
@@ -715,8 +715,8 @@ private:
 
         auto shapeShiftType = fir::ShapeShiftType::get(
             builder.getContext(), shapeShiftOperands.size() / 2);
-        return builder.create<fir::ShapeShiftOp>(
-            liveInArg.getLoc(), shapeShiftType, shapeShiftOperands);
+        return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(),
+                                         shapeShiftType, shapeShiftOperands);
       }
 
       llvm::SmallVector<mlir::Value> shapeOperands;
@@ -728,11 +728,11 @@ private:
         ++shapeIdx;
       }
 
-      return builder.create<fir::ShapeOp>(liveInArg.getLoc(), shapeOperands);
+      return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands);
     }();
 
-    return builder.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
-                                            liveInName, shape);
+    return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg,
+                                    liveInName, shape);
   }
 
   mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter,
@@ -742,13 +742,13 @@ private:
     genReductions(rewriter, mapper, loop, teamsOps);
 
     mlir::Location loc = loop.getLoc();
-    auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(loc, teamsOps);
+    auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, teamsOps);
     Fortran::common::openmp::EntryBlockArgs teamsArgs;
     teamsArgs.reduction.vars = teamsOps.reductionVars;
     Fortran::common::openmp::genEntryBlock(rewriter, teamsArgs,
                                            teamsOp.getRegion());
 
-    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+    rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
 
     for (auto [loopVar, teamsArg] : llvm::zip_equal(
              loop.getReduceVars(), teamsOp.getRegion().getArguments())) {
@@ -761,8 +761,8 @@ private:
   mlir::omp::DistributeOp
   genDistributeOp(mlir::Location loc,
                   mlir::ConversionPatternRewriter &rewriter) const {
-    auto distOp = rewriter.create<mlir::omp::DistributeOp>(
-        loc, /*clauses=*/mlir::omp::DistributeOperands{});
+    auto distOp = mlir::omp::DistributeOp::create(
+        rewriter, loc, /*clauses=*/mlir::omp::DistributeOperands{});
 
     rewriter.createBlock(&distOp.getRegion());
     return distOp;
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
index 8a9b383..7b61539 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
@@ -282,14 +282,14 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) {
           &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {});
       for (auto arg : teamsBlock->getArguments())
         newTeamsBlock->addArgument(arg.getType(), arg.getLoc());
-      auto newWorkdistribute = rewriter.create<omp::WorkdistributeOp>(loc);
-      rewriter.create<omp::TerminatorOp>(loc);
+      auto newWorkdistribute = omp::WorkdistributeOp::create(rewriter, loc);
+      omp::TerminatorOp::create(rewriter, loc);
       rewriter.createBlock(&newWorkdistribute.getRegion(),
                            newWorkdistribute.getRegion().begin(), {}, {});
       auto *cloned = rewriter.clone(*parallelize);
       parallelize->replaceAllUsesWith(cloned);
       parallelize->erase();
-      rewriter.create<omp::TerminatorOp>(loc);
+      omp::TerminatorOp::create(rewriter, loc);
       changed = true;
     }
   }
@@ -298,10 +298,10 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) {
 
 /// Generate omp.parallel operation with an empty region.
 static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) {
-  auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
+  auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
   parallelOp.setComposite(composite);
   rewriter.createBlock(&parallelOp.getRegion());
-  rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+  rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
   return;
 }
 
@@ -309,7 +309,7 @@ static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) {
 static void genDistributeOp(Location loc, OpBuilder &rewriter, bool composite) {
   mlir::omp::DistributeOperands distributeClauseOps;
   auto distributeOp =
-      rewriter.create<mlir::omp::DistributeOp>(loc, distributeClauseOps);
+      mlir::omp::DistributeOp::create(rewriter, loc, distributeClauseOps);
   distributeOp.setComposite(composite);
   auto distributeBlock = rewriter.createBlock(&distributeOp.getRegion());
   rewriter.setInsertionPointToStart(distributeBlock);
@@ -334,12 +334,12 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop,
                         const mlir::omp::LoopNestOperands &clauseOps,
                         bool composite) {
 
-  auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+  auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, doLoop.getLoc());
   wsloopOp.setComposite(composite);
   rewriter.createBlock(&wsloopOp.getRegion());
 
   auto loopNestOp =
-      rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+      mlir::omp::LoopNestOp::create(rewriter, doLoop.getLoc(), clauseOps);
 
   // Clone the loop's body inside the loop nest construct using the
   // mapped values.
@@ -351,7 +351,7 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop,
   // Erase fir.result op of do loop and create yield op.
   if (auto resultOp = dyn_cast<fir::ResultOp>(terminatorOp)) {
     rewriter.setInsertionPoint(terminatorOp);
-    rewriter.create<mlir::omp::YieldOp>(doLoop->getLoc());
+    mlir::omp::YieldOp::create(rewriter, doLoop->getLoc());
     terminatorOp->erase();
   }
 }
@@ -494,15 +494,15 @@ static SmallVector<Value> convertFlatToMultiDim(OpBuilder &builder,
   // Convert flat index to multi-dimensional indices
   SmallVector<Value> indices(rank);
   Value temp = flatIdx;
-  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  auto c1 = arith::ConstantIndexOp::create(builder, loc, 1);
 
   // Work backwards through dimensions (row-major order)
   for (int i = rank - 1; i >= 0; --i) {
-    Value zeroBasedIdx = builder.create<arith::RemSIOp>(loc, temp, extents[i]);
+    Value zeroBasedIdx = arith::RemSIOp::create(builder, loc, temp, extents[i]);
     // Convert to one-based index
-    indices[i] = builder.create<arith::AddIOp>(loc, zeroBasedIdx, c1);
+    indices[i] = arith::AddIOp::create(builder, loc, zeroBasedIdx, c1);
     if (i > 0) {
-      temp = builder.create<arith::DivSIOp>(loc, temp, extents[i]);
+      temp = arith::DivSIOp::create(builder, loc, temp, extents[i]);
     }
   }
 
@@ -525,7 +525,7 @@ static Value CalculateTotalElements(OpBuilder &builder, Location loc,
     if (i == 0) {
       totalElems = extent;
     } else {
-      totalElems = builder.create<arith::MulIOp>(loc, totalElems, extent);
+      totalElems = arith::MulIOp::create(builder, loc, totalElems, extent);
     }
   }
   return totalElems;
@@ -562,14 +562,14 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc,
   // Load destination array box (if it's a reference)
   Value arrayBox = destBox;
   if (isa<fir::ReferenceType>(destBox.getType()))
-    arrayBox = builder.create<fir::LoadOp>(loc, destBox);
+    arrayBox = fir::LoadOp::create(builder, loc, destBox);
 
-  auto scalarValue = builder.create<fir::BoxAddrOp>(loc, srcBox);
-  Value scalar = builder.create<fir::LoadOp>(loc, scalarValue);
+  auto scalarValue = fir::BoxAddrOp::create(builder, loc, srcBox);
+  Value scalar = fir::LoadOp::create(builder, loc, scalarValue);
 
   // Calculate total number of elements (flattened)
-  auto c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
-  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  auto c0 = arith::ConstantIndexOp::create(builder, loc, 0);
+  auto c1 = arith::ConstantIndexOp::create(builder, loc, 1);
   Value totalElems = CalculateTotalElements(builder, loc, arrayBox);
 
   auto *workdistributeBlock = &workdistribute.getRegion().front();
@@ -587,7 +587,7 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc,
       builder, loc, fir::ReferenceType::get(scalar.getType()), arrayBox,
       nullptr, nullptr, ValueRange{indices}, ValueRange{});
 
-  builder.create<fir::StoreOp>(loc, scalar, elemPtr);
+  fir::StoreOp::create(builder, loc, scalar, elemPtr);
 }
 
 /// workdistributeRuntimeCallLower method finds the runtime calls
@@ -749,14 +749,15 @@ FailureOr<omp::TargetOp> splitTargetData(omp::TargetOp targetOp,
   auto deviceAddrVars = targetOp.getHasDeviceAddrVars();
   auto devicePtrVars = targetOp.getIsDevicePtrVars();
   // Create the target data op
-  auto targetDataOp = rewriter.create<omp::TargetDataOp>(
-      loc, device, ifExpr, outerMapInfos, deviceAddrVars, devicePtrVars);
+  auto targetDataOp =
+      omp::TargetDataOp::create(rewriter, loc, device, ifExpr, outerMapInfos,
+                                deviceAddrVars, devicePtrVars);
   auto taregtDataBlock = rewriter.createBlock(&targetDataOp.getRegion());
-  rewriter.create<mlir::omp::TerminatorOp>(loc);
+  mlir::omp::TerminatorOp::create(rewriter, loc);
   rewriter.setInsertionPointToStart(taregtDataBlock);
   // Create the inner target op
-  auto newTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  auto newTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(),
@@ -821,19 +822,19 @@ static TempOmpVar allocateTempOmpVar(Location loc, Type ty,
   // Get the appropriate type for allocation
   if (isPtr(ty)) {
     Type intTy = rewriter.getI32Type();
-    auto one = rewriter.create<LLVM::ConstantOp>(loc, intTy, 1);
+    auto one = LLVM::ConstantOp::create(rewriter, loc, intTy, 1);
     allocType = llvmPtrTy;
-    alloc = rewriter.create<LLVM::AllocaOp>(loc, llvmPtrTy, allocType, one);
+    alloc = LLVM::AllocaOp::create(rewriter, loc, llvmPtrTy, allocType, one);
     allocType = intTy;
   } else {
     allocType = ty;
-    alloc = rewriter.create<fir::AllocaOp>(loc, allocType);
+    alloc = fir::AllocaOp::create(rewriter, loc, allocType);
   }
   // Lambda to create mapinfo ops
   auto getMapInfo = [&](mlir::omp::ClauseMapFlags mappingFlags,
                         const char *name) {
-    return rewriter.create<omp::MapInfoOp>(
-        loc, alloc.getType(), alloc, TypeAttr::get(allocType),
+    return omp::MapInfoOp::create(
+        rewriter, loc, alloc.getType(), alloc, TypeAttr::get(allocType),
         rewriter.getAttr<omp::ClauseMapFlagsAttr>(mappingFlags),
         rewriter.getAttr<omp::VariableCaptureKindAttr>(
             omp::VariableCaptureKind::ByRef),
@@ -979,12 +980,12 @@ static void reloadCacheAndRecompute(
     // If the original value is a pointer or reference, load and convert if
     // necessary.
     if (isPtr(original.getType())) {
-      restored = rewriter.create<LLVM::LoadOp>(loc, llvmPtrTy, newArg);
+      restored = LLVM::LoadOp::create(rewriter, loc, llvmPtrTy, newArg);
       if (!isa<LLVM::LLVMPointerType>(original.getType()))
         restored =
-            rewriter.create<fir::ConvertOp>(loc, original.getType(), restored);
+            fir::ConvertOp::create(rewriter, loc, original.getType(), restored);
     } else {
-      restored = rewriter.create<fir::LoadOp>(loc, newArg);
+      restored = fir::LoadOp::create(rewriter, loc, newArg);
     }
     irMapping.map(original, restored);
   }
@@ -1053,7 +1054,7 @@ static mlir::LLVM::ConstantOp
 genI32Constant(mlir::Location loc, mlir::RewriterBase &rewriter, int value) {
   mlir::Type i32Ty = rewriter.getI32Type();
   mlir::IntegerAttr attr = rewriter.getI32IntegerAttr(value);
-  return rewriter.create<mlir::LLVM::ConstantOp>(loc, i32Ty, attr);
+  return mlir::LLVM::ConstantOp::create(rewriter, loc, i32Ty, attr);
 }
 
 /// Given a box descriptor, extract the base address of the data it describes.
@@ -1230,8 +1231,8 @@ static void genFortranAssignOmpReplacement(fir::FirOpBuilder &builder,
       genOmpGetMappedPtrIfPresent(builder, loc, destBase, device, module);
   Value srcPtr =
       genOmpGetMappedPtrIfPresent(builder, loc, srcBase, device, module);
-  Value zero = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
-                                                builder.getI64IntegerAttr(0));
+  Value zero = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(),
+                                        builder.getI64IntegerAttr(0));
 
   // Generate the call to omp_target_memcpy to perform the data copy on the
   // device.
@@ -1348,23 +1349,24 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter,
   for (Operation *op : opsToReplace) {
     if (auto allocOp = dyn_cast<fir::AllocMemOp>(op)) {
       rewriter.setInsertionPoint(allocOp);
-      auto ompAllocmemOp = rewriter.create<omp::TargetAllocMemOp>(
-          allocOp.getLoc(), rewriter.getI64Type(), device,
+      auto ompAllocmemOp = omp::TargetAllocMemOp::create(
+          rewriter, allocOp.getLoc(), rewriter.getI64Type(), device,
           allocOp.getInTypeAttr(), allocOp.getUniqNameAttr(),
           allocOp.getBindcNameAttr(), allocOp.getTypeparams(),
           allocOp.getShape());
-      auto firConvertOp = rewriter.create<fir::ConvertOp>(
-          allocOp.getLoc(), allocOp.getResult().getType(),
-          ompAllocmemOp.getResult());
+      auto firConvertOp = fir::ConvertOp::create(rewriter, allocOp.getLoc(),
+                                                 allocOp.getResult().getType(),
+                                                 ompAllocmemOp.getResult());
       rewriter.replaceOp(allocOp, firConvertOp.getResult());
     }
     // Replace fir.freemem with omp.target_freemem.
     else if (auto freeOp = dyn_cast<fir::FreeMemOp>(op)) {
       rewriter.setInsertionPoint(freeOp);
-      auto firConvertOp = rewriter.create<fir::ConvertOp>(
-          freeOp.getLoc(), rewriter.getI64Type(), freeOp.getHeapref());
-      rewriter.create<omp::TargetFreeMemOp>(freeOp.getLoc(), device,
-                                            firConvertOp.getResult());
+      auto firConvertOp =
+          fir::ConvertOp::create(rewriter, freeOp.getLoc(),
+                                 rewriter.getI64Type(), freeOp.getHeapref());
+      omp::TargetFreeMemOp::create(rewriter, freeOp.getLoc(), device,
+                                   firConvertOp.getResult());
       rewriter.eraseOp(freeOp);
     }
     // fir.declare changes its type when hoisting it out of omp.target to
@@ -1376,8 +1378,9 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter,
           dyn_cast<fir::ReferenceType>(clonedInType);
       Type clonedEleTy = clonedRefType.getElementType();
       rewriter.setInsertionPoint(op);
-      Value loadedValue = rewriter.create<fir::LoadOp>(
-          clonedDeclareOp.getLoc(), clonedEleTy, clonedDeclareOp.getMemref());
+      Value loadedValue =
+          fir::LoadOp::create(rewriter, clonedDeclareOp.getLoc(), clonedEleTy,
+                              clonedDeclareOp.getMemref());
       clonedDeclareOp.getResult().replaceAllUsesWith(loadedValue);
     }
     // Replace runtime calls with omp versions.
@@ -1473,8 +1476,8 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector<Value> &preMapOperands,
   auto *targetBlock = &targetOp.getRegion().front();
   SmallVector<Value> preHostEvalVars{targetOp.getHostEvalVars()};
   // update the hostEvalVars of preTargetOp
-  omp::TargetOp preTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp preTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), preHostEvalVars,
@@ -1513,13 +1516,13 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector<Value> &preMapOperands,
     // Create the store operation.
     if (isPtr(originalResult.getType())) {
       if (!isa<LLVM::LLVMPointerType>(toStore.getType()))
-        toStore = rewriter.create<fir::ConvertOp>(loc, llvmPtrTy, toStore);
-      rewriter.create<LLVM::StoreOp>(loc, toStore, newArg);
+        toStore = fir::ConvertOp::create(rewriter, loc, llvmPtrTy, toStore);
+      LLVM::StoreOp::create(rewriter, loc, toStore, newArg);
     } else {
-      rewriter.create<fir::StoreOp>(loc, toStore, newArg);
+      fir::StoreOp::create(rewriter, loc, toStore, newArg);
     }
   }
-  rewriter.create<omp::TerminatorOp>(loc);
+  omp::TerminatorOp::create(rewriter, loc);
 
   // Update hostEvalVars with the mapped values for the loop bounds if we have
   // a loopNestOp and we are not generating code for the target device.
@@ -1563,8 +1566,8 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector<Value> &postMapOperands,
                                 hostEvalVars.steps.end());
   }
   // Create the isolated target op
-  omp::TargetOp isolatedTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp isolatedTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(),
@@ -1590,7 +1593,7 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector<Value> &postMapOperands,
 
   // Clone the original operations.
   rewriter.clone(*splitBeforeOp, isolatedMapping);
-  rewriter.create<omp::TerminatorOp>(loc);
+  omp::TerminatorOp::create(rewriter, loc);
 
   // update the loop bounds in the isolatedTargetOp if we have host_eval vars
   // and we are not generating code for the target device.
@@ -1643,8 +1646,8 @@ static omp::TargetOp genPostTargetOp(omp::TargetOp targetOp,
   auto *targetBlock = &targetOp.getRegion().front();
   SmallVector<Value> postHostEvalVars{targetOp.getHostEvalVars()};
   // Create the post target op
-  omp::TargetOp postTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp postTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), postHostEvalVars,
diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index 566e88b..bd07d7f 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -883,18 +883,16 @@ class MapInfoFinalizationPass
     if (explicitMappingPresent(op, targetDataOp))
       return;
 
-    mlir::omp::MapInfoOp newDescParentMapOp =
-        builder.create<mlir::omp::MapInfoOp>(
-            op->getLoc(), op.getResult().getType(), op.getVarPtr(),
-            op.getVarTypeAttr(),
-            builder.getAttr<mlir::omp::ClauseMapFlagsAttr>(
-                mlir::omp::ClauseMapFlags::to |
-                mlir::omp::ClauseMapFlags::always),
-            op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{},
-            mlir::SmallVector<mlir::Value>{}, mlir::ArrayAttr{},
-            /*bounds=*/mlir::SmallVector<mlir::Value>{},
-            /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
-            /*partial_map=*/builder.getBoolAttr(false));
+    mlir::omp::MapInfoOp newDescParentMapOp = mlir::omp::MapInfoOp::create(
+        builder, op->getLoc(), op.getResult().getType(), op.getVarPtr(),
+        op.getVarTypeAttr(),
+        builder.getAttr<mlir::omp::ClauseMapFlagsAttr>(
+            mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::always),
+        op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{},
+        mlir::SmallVector<mlir::Value>{}, mlir::ArrayAttr{},
+        /*bounds=*/mlir::SmallVector<mlir::Value>{},
+        /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+        /*partial_map=*/builder.getBoolAttr(false));
 
     targetDataOp.getMapVarsMutable().append({newDescParentMapOp});
   }
@@ -946,14 +944,13 @@ class MapInfoFinalizationPass
     // need to see how well this alteration works.
     auto loadBaseAddr =
         builder.loadIfRef(op->getLoc(), baseAddr.getVarPtrPtr());
-    mlir::omp::MapInfoOp newBaseAddrMapOp =
-        builder.create<mlir::omp::MapInfoOp>(
-            op->getLoc(), loadBaseAddr.getType(), loadBaseAddr,
-            baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(),
-            baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members,
-            membersAttr, baseAddr.getBounds(),
-            /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
-            /*partial_map=*/builder.getBoolAttr(false));
+    mlir::omp::MapInfoOp newBaseAddrMapOp = mlir::omp::MapInfoOp::create(
+        builder, op->getLoc(), loadBaseAddr.getType(), loadBaseAddr,
+        baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(),
+        baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members, membersAttr,
+        baseAddr.getBounds(),
+        /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+        /*partial_map=*/builder.getBoolAttr(false));
     op.replaceAllUsesWith(newBaseAddrMapOp.getResult());
     op->erase();
     baseAddr.erase();
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6dae39b..103e736 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -426,6 +426,12 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
 
   // Add codegen pass pipeline.
   fir::createDefaultFIRCodeGenPassPipeline(pm, config, inputFilename);
+
+  // Run a pass to prepare for translation of delayed privatization in the
+  // context of deferred target tasks.
+  addPassConditionally(pm, disableFirToLlvmIr, [&]() {
+    return mlir::omp::createPrepareForOMPOffloadPrivatizationPass();
+  });
 }
 
 } // namespace fir
diff --git a/flang/lib/Optimizer/Support/Utils.cpp b/flang/lib/Optimizer/Support/Utils.cpp
index 92390e4a..2f33d89 100644
--- a/flang/lib/Optimizer/Support/Utils.cpp
+++ b/flang/lib/Optimizer/Support/Utils.cpp
@@ -66,7 +66,7 @@ fir::genConstantIndex(mlir::Location loc, mlir::Type ity,
                       mlir::ConversionPatternRewriter &rewriter,
                       std::int64_t offset) {
   auto cattr = rewriter.getI64IntegerAttr(offset);
-  return rewriter.create<mlir::LLVM::ConstantOp>(loc, ity, cattr);
+  return mlir::LLVM::ConstantOp::create(rewriter, loc, ity, cattr);
 }
 
 mlir::Value
@@ -125,9 +125,9 @@ mlir::Value fir::integerCast(const fir::LLVMTypeConverter &converter,
       return rewriter.createOrFold<mlir::LLVM::SExtOp>(loc, ty, val);
   } else {
     if (toSize < fromSize)
-      return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
+      return mlir::LLVM::TruncOp::create(rewriter, loc, ty, val);
     if (toSize > fromSize)
-      return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+      return mlir::LLVM::SExtOp::create(rewriter, loc, ty, val);
   }
   return val;
 }
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index ed9a2ae..5bf783d 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -832,8 +832,8 @@ static mlir::Type getEleTy(mlir::Type ty) {
 static bool isAssumedSize(llvm::SmallVectorImpl<mlir::Value> &extents) {
   if (extents.empty())
     return false;
-  auto cstLen = fir::getIntIfConstant(extents.back());
-  return cstLen.has_value() && *cstLen == -1;
+  return llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(
+      extents.back().getDefiningOp());
 }
 
 // Extract extents from the ShapeOp/ShapeShiftOp into the result vector.
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 3d2db6a..caa9bdd 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -131,6 +131,9 @@ struct FindHostArray
     return (*this)(x.base());
   }
   Result operator()(const Symbol &symbol) const {
+    if (symbol.IsFuncResult()) {
+      return nullptr;
+    }
     if (const auto *details{
             symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
       if (details->IsArray() &&
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index be10669..4141630 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -61,6 +61,124 @@ namespace Fortran::semantics {
 using namespace Fortran::semantics::omp;
 using namespace Fortran::parser::omp;
 
+OmpStructureChecker::OmpStructureChecker(SemanticsContext &context)
+    : DirectiveStructureChecker(context,
+#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP
+#include "llvm/Frontend/OpenMP/OMP.inc"
+      ) {
+  scopeStack_.push_back(&context.globalScope());
+}
+
+bool OmpStructureChecker::Enter(const parser::MainProgram &x) {
+  using StatementProgramStmt = parser::Statement<parser::ProgramStmt>;
+  if (auto &stmt{std::get<std::optional<StatementProgramStmt>>(x.t)}) {
+    scopeStack_.push_back(stmt->statement.v.symbol->scope());
+  } else {
+    for (const Scope &scope : context_.globalScope().children()) {
+      // There can only be one main program.
+      if (scope.kind() == Scope::Kind::MainProgram) {
+        scopeStack_.push_back(&scope);
+        break;
+      }
+    }
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::MainProgram &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::BlockData &x) {
+  // The BLOCK DATA name is optional, so we need to look for the
+  // corresponding scope in the global scope.
+  auto &stmt{std::get<parser::Statement<parser::BlockDataStmt>>(x.t)};
+  if (auto &name{stmt.statement.v}) {
+    scopeStack_.push_back(name->symbol->scope());
+  } else {
+    for (const Scope &scope : context_.globalScope().children()) {
+      if (scope.kind() == Scope::Kind::BlockData) {
+        if (scope.symbol()->name().empty()) {
+          scopeStack_.push_back(&scope);
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::BlockData &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::Module &x) {
+  auto &stmt{std::get<parser::Statement<parser::ModuleStmt>>(x.t)};
+  const Symbol *sym{stmt.statement.v.symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::Module &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::Submodule &x) {
+  auto &stmt{std::get<parser::Statement<parser::SubmoduleStmt>>(x.t)};
+  const Symbol *sym{std::get<parser::Name>(stmt.statement.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::Submodule &x) {
+  scopeStack_.pop_back();
+}
+
+// Function/subroutine subprogram nodes don't appear in INTERFACEs, but
+// the subprogram/end statements do.
+bool OmpStructureChecker::Enter(const parser::SubroutineStmt &x) {
+  const Symbol *sym{std::get<parser::Name>(x.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::EndSubroutineStmt &x) {
+  scopeStack_.pop_back();
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::FunctionStmt &x) {
+  const Symbol *sym{std::get<parser::Name>(x.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::EndFunctionStmt &x) {
+  scopeStack_.pop_back();
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::BlockConstruct &x) {
+  auto &specPart{std::get<parser::BlockSpecificationPart>(x.t)};
+  auto &execPart{std::get<parser::Block>(x.t)};
+  if (auto &&source{parser::GetSource(specPart)}) {
+    scopeStack_.push_back(&context_.FindScope(*source));
+  } else if (auto &&source{parser::GetSource(execPart)}) {
+    scopeStack_.push_back(&context_.FindScope(*source));
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::BlockConstruct &x) {
+  auto &specPart{std::get<parser::BlockSpecificationPart>(x.t)};
+  auto &execPart{std::get<parser::Block>(x.t)};
+  // Pop the scope pushed by the matching Enter, which pushes exactly when
+  // either part has a source range; pushing again here would unbalance it.
+  if (parser::GetSource(specPart) || parser::GetSource(execPart)) {
+    scopeStack_.pop_back();
+  }
+}
+
 // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'.
 #define CHECK_SIMPLE_CLAUSE(X, Y) \
   void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \
@@ -362,6 +480,36 @@ bool OmpStructureChecker::IsNestedInDirective(llvm::omp::Directive directive) {
   return false;
 }
 
+bool OmpStructureChecker::InTargetRegion() {
+  if (IsNestedInDirective(llvm::omp::Directive::OMPD_target)) {
+    // Return true even for device_type(host).
+    return true;
+  }
+  for (const Scope *scope : llvm::reverse(scopeStack_)) {
+    if (const auto *symbol{scope->symbol()}) {
+      if (symbol->test(Symbol::Flag::OmpDeclareTarget)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool OmpStructureChecker::HasRequires(llvm::omp::Clause req) {
+  const Scope &unit{GetProgramUnit(*scopeStack_.back())};
+  return common::visit(
+      [&](const auto &details) {
+        if constexpr (std::is_convertible_v<decltype(details),
+                          const WithOmpDeclarative &>) {
+          if (auto *reqs{details.ompRequires()}) {
+            return reqs->test(req);
+          }
+        }
+        return false;
+      },
+      DEREF(unit.symbol()).details());
+}
+
 void OmpStructureChecker::CheckVariableListItem(
     const SymbolSourceMap &symbols) {
   for (auto &[symbol, source] : symbols) {
@@ -1562,40 +1710,95 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) {
   dirContext_.pop_back();
 }
 
-void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) {
-  isPredefinedAllocator = true;
-  const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
-  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
-  SymbolSourceMap currSymbols;
-  GetSymbolsInObjectList(objectList, currSymbols);
-  for (auto &[symbol, source] : currSymbols) {
-    if (IsPointer(*symbol)) {
-      context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not have POINTER "
-          "attribute"_err_en_US,
-          source.ToString());
+void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source,
+    const parser::OmpObjectList &objects,
+    const parser::OmpClauseList &clauses) {
+  const Scope &thisScope{context_.FindScope(source)};
+  SymbolSourceMap symbols;
+  GetSymbolsInObjectList(objects, symbols);
+
+  auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) {
+    // Return "true" if the ALLOCATOR clause was provided with an argument
+    // that is either a predefined allocator, or a run-time value.
+    // Otherwise return "false".
+    if (!calloc) {
+      return false;
     }
-    if (IsDummy(*symbol)) {
+    auto *allocator{std::get_if<parser::OmpClause::Allocator>(&calloc->u)};
+    if (auto val{ToInt64(GetEvaluateExpr(DEREF(allocator).v))}) {
+      // Predefined allocators (defined in OpenMP 6.0 20.8.1):
+      //   omp_null_allocator = 0,
+      //   omp_default_mem_alloc = 1,
+      //   omp_large_cap_mem_alloc = 2,
+      //   omp_const_mem_alloc = 3,
+      //   omp_high_bw_mem_alloc = 4,
+      //   omp_low_lat_mem_alloc = 5,
+      //   omp_cgroup_mem_alloc = 6,
+      //   omp_pteam_mem_alloc = 7,
+      //   omp_thread_mem_alloc = 8
+      return *val >= 0 && *val <= 8;
+    }
+    return true;
+  }};
+
+  const auto *allocator{FindClause(llvm::omp::Clause::OMPC_allocator)};
+  if (InTargetRegion()) {
+    bool hasDynAllocators{
+        HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)};
+    if (!allocator && !hasDynAllocators) {
       context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not be a dummy "
-          "argument"_err_en_US,
-          source.ToString());
+          "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US);
+    }
+  }
+
+  auto maybePredefined{maybeHasPredefinedAllocator(allocator)};
+
+  for (auto &[symbol, source] : symbols) {
+    if (!inExecutableAllocate_) {
+      if (symbol->owner() != thisScope) {
+        context_.Say(source,
+            "A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears"_err_en_US);
+      }
+      if (IsPointer(*symbol) || IsAllocatable(*symbol)) {
+        context_.Say(source,
+            "A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute"_err_en_US);
+      }
     }
     if (symbol->GetUltimate().has<AssocEntityDetails>()) {
       context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not be an associate "
-          "name"_err_en_US,
-          source.ToString());
+          "A list item in a declarative ALLOCATE cannot be an associate name"_err_en_US);
+    }
+    if (symbol->attrs().test(Attr::SAVE) || IsCommonBlock(*symbol)) {
+      if (!allocator) {
+        context_.Say(source,
+            "If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US);
+      } else if (!maybePredefined) {
+        context_.Say(source,
+            "If a list item is a named common block or has SAVE attribute, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US);
+      }
+    }
+    if (FindCommonBlockContaining(*symbol)) {
+      context_.Say(source,
+          "A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block"_err_en_US);
     }
   }
-  CheckVarIsNotPartOfAnotherVar(dir.source, objectList);
+  CheckVarIsNotPartOfAnotherVar(source, objects);
 }
 
-void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) {
+void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) {
   const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
-  CheckPredefinedAllocatorRestriction(dir.source, objectList);
+  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) {
+  if (!inExecutableAllocate_) {
+    const auto &dir{std::get<parser::Verbatim>(x.t)};
+    const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
+    const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
+
+    isPredefinedAllocator = true;
+    CheckAllocateDirective(dir.source, objectList, clauseList);
+  }
   dirContext_.pop_back();
 }
 
@@ -1951,6 +2154,7 @@ void OmpStructureChecker::CheckNameInAllocateStmt(
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
+  inExecutableAllocate_ = true;
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
 
@@ -1960,24 +2164,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
         "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US);
   }
 
-  bool hasAllocator = false;
-  // TODO: Investigate whether searching the clause list can be done with
-  // parser::Unwrap instead of the following loop
-  const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
-  for (const auto &clause : clauseList.v) {
-    if (std::get_if<parser::OmpClause::Allocator>(&clause.u)) {
-      hasAllocator = true;
-    }
-  }
-
-  if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) {
-    // TODO: expand this check to exclude the case when a requires
-    //       directive with the dynamic_allocators clause is present
-    //       in the same compilation unit (OMP5.0 2.11.3).
-    context_.Say(x.source,
-        "ALLOCATE directives that appear in a TARGET region must specify an allocator clause"_err_en_US);
-  }
-
   const auto &allocateStmt =
       std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement;
   if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) {
@@ -1994,18 +2180,34 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
   }
 
   isPredefinedAllocator = true;
-  const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (objectList) {
-    CheckVarIsNotPartOfAnotherVar(dir.source, *objectList);
-  }
 }
 
 void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) {
-  const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (objectList)
-    CheckPredefinedAllocatorRestriction(dir.source, *objectList);
+  parser::OmpObjectList empty{std::list<parser::OmpObject>{}};
+  auto &objects{[&]() -> const parser::OmpObjectList & {
+    if (auto &objects{std::get<std::optional<parser::OmpObjectList>>(x.t)}) {
+      return *objects;
+    } else {
+      return empty;
+    }
+  }()};
+  auto &clauses{std::get<parser::OmpClauseList>(x.t)};
+  CheckAllocateDirective(
+      std::get<parser::Verbatim>(x.t).source, objects, clauses);
+  // Check each nested ALLOCATE directive against its own source location.
+  if (const auto &subDirs{
+          std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>(
+              x.t)}) {
+    for (const auto &dalloc : *subDirs) {
+      const auto &dir{std::get<parser::Verbatim>(dalloc.t)};
+      const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)};
+      const auto &objects{std::get<parser::OmpObjectList>(dalloc.t)};
+      CheckAllocateDirective(dir.source, objects, clauses);
+    }
+  }
+
   dirContext_.pop_back();
+  inExecutableAllocate_ = false;
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) {
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index b3fd6c8..7426559 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -56,21 +56,32 @@ using SymbolSourceMap = std::multimap<const Symbol *, parser::CharBlock>;
 using DirectivesClauseTriple = std::multimap<llvm::omp::Directive,
     std::pair<llvm::omp::Directive, const OmpClauseSet>>;
 
-class OmpStructureChecker
-    : public DirectiveStructureChecker<llvm::omp::Directive, llvm::omp::Clause,
-          parser::OmpClause, llvm::omp::Clause_enumSize> {
+using OmpStructureCheckerBase = DirectiveStructureChecker<llvm::omp::Directive,
+    llvm::omp::Clause, parser::OmpClause, llvm::omp::Clause_enumSize>;
+
+class OmpStructureChecker : public OmpStructureCheckerBase {
 public:
-  using Base = DirectiveStructureChecker<llvm::omp::Directive,
-      llvm::omp::Clause, parser::OmpClause, llvm::omp::Clause_enumSize>;
+  using Base = OmpStructureCheckerBase;
+
+  OmpStructureChecker(SemanticsContext &context);
 
-  OmpStructureChecker(SemanticsContext &context)
-      : DirectiveStructureChecker(context,
-#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP
-#include "llvm/Frontend/OpenMP/OMP.inc"
-        ) {
-  }
   using llvmOmpClause = const llvm::omp::Clause;
 
+  bool Enter(const parser::MainProgram &);
+  void Leave(const parser::MainProgram &);
+  bool Enter(const parser::BlockData &);
+  void Leave(const parser::BlockData &);
+  bool Enter(const parser::Module &);
+  void Leave(const parser::Module &);
+  bool Enter(const parser::Submodule &);
+  void Leave(const parser::Submodule &);
+  bool Enter(const parser::SubroutineStmt &);
+  bool Enter(const parser::EndSubroutineStmt &);
+  bool Enter(const parser::FunctionStmt &);
+  bool Enter(const parser::EndFunctionStmt &);
+  bool Enter(const parser::BlockConstruct &);
+  void Leave(const parser::BlockConstruct &);
+
   void Enter(const parser::OpenMPConstruct &);
   void Leave(const parser::OpenMPConstruct &);
   void Enter(const parser::OpenMPInteropConstruct &);
@@ -177,10 +188,12 @@ private:
       const parser::CharBlock &, const OmpDirectiveSet &);
   bool IsCloselyNestedRegion(const OmpDirectiveSet &set);
   bool IsNestedInDirective(llvm::omp::Directive directive);
+  bool InTargetRegion();
   void HasInvalidTeamsNesting(
       const llvm::omp::Directive &dir, const parser::CharBlock &source);
   void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x);
   void HasInvalidLoopBinding(const parser::OpenMPLoopConstruct &x);
+  bool HasRequires(llvm::omp::Clause req);
   // specific clause related
   void CheckAllowedMapTypes(
       parser::OmpMapType::Value, llvm::ArrayRef<parser::OmpMapType::Value>);
@@ -250,6 +263,9 @@ private:
   bool CheckTargetBlockOnlyTeams(const parser::Block &);
   void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock);
   void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock);
+  void CheckAllocateDirective(parser::CharBlock source,
+      const parser::OmpObjectList &objects,
+      const parser::OmpClauseList &clauses);
 
   void CheckIteratorRange(const parser::OmpIteratorSpecifier &x);
   void CheckIteratorModifier(const parser::OmpIterator &x);
@@ -367,12 +383,15 @@ private:
   };
   int directiveNest_[LastType + 1] = {0};
 
+  bool inExecutableAllocate_{false};
   parser::CharBlock visitedAtomicSource_;
   SymbolSourceMap deferredNonVariables_;
 
   using LoopConstruct = std::variant<const parser::DoConstruct *,
       const parser::OpenMPLoopConstruct *>;
   std::vector<LoopConstruct> loopStack_;
+  // Scopes for scoping units.
+  std::vector<const Scope *> scopeStack_;
 };
 
 /// Find a duplicate entry in the range, and return an iterator to it.
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 292e73b..cc55bb4 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -218,7 +218,7 @@ bool IsMapExitingType(parser::OmpMapType::Value type) {
   }
 }
 
-std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr) {
+MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr) {
   const parser::TypedExpr &typedExpr{parserExpr.typedExpr};
   // ForwardOwningPointer           typedExpr
   // `- GenericExprWrapper          ^.get()
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index c410bd4..196755e 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -3094,26 +3094,6 @@ void OmpAttributeVisitor::ResolveOmpDesignator(
         AddAllocateName(name);
       }
     }
-    if (ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective &&
-        IsAllocatable(*symbol) &&
-        !IsNestedInDirective(llvm::omp::Directive::OMPD_allocate)) {
-      context_.Say(designator.source,
-          "List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US);
-    }
-    bool checkScope{ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective};
-    // In 5.1 the scope check only applies to declarative allocate.
-    if (version == 50 && !checkScope) {
-      checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective;
-    }
-    if (checkScope) {
-      if (omp::GetScopingUnit(GetContext().scope) !=
-          omp::GetScopingUnit(symbol->GetUltimate().owner())) {
-        context_.Say(designator.source, // 2.15.3
-            "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US,
-            parser::ToUpperCaseLetters(
-                llvm::omp::getOpenMPDirectiveName(directive, version)));
-      }
-    }
     if (ompFlag == Symbol::Flag::OmpReduction) {
       // Using variables inside of a namelist in OpenMP reductions
       // is allowed by the standard, but is not allowed for
diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp
index 15a42c3..c2036c4 100644
--- a/flang/lib/Utils/OpenMP.cpp
+++ b/flang/lib/Utils/OpenMP.cpp
@@ -112,7 +112,7 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder,
   mlir::Block *entryBlock = &region.getBlocks().front();
   firOpBuilder.setInsertionPointToStart(entryBlock);
   auto loadOp =
-      firOpBuilder.create<fir::LoadOp>(clonedValArg.getLoc(), clonedValArg);
+      fir::LoadOp::create(firOpBuilder, clonedValArg.getLoc(), clonedValArg);
   return loadOp.getResult();
 }
 
diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
index a334934..48fee10 100644
--- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
+++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
@@ -55,3 +55,56 @@ func.func @main(%arg0: complex<f64>) {
 // CHECK-SAME: (%arg0: f64, %arg1: f64) kernel {
 // CHECK: gpu.return
 // CHECK: gpu.launch_func  @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : f64, %{{.*}} : f64) {cuf.proc_attr = #cuf.cuda_proc<global>}
+
+// -----
+
+module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @testmod {
+    gpu.func @_QMbarPfoo(%arg0: f32, %arg1: !fir.ref<!fir.array<100xf32>>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPfoo(
+// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref<!fir.array<100xf32>>, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WORKGROUP:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WORKGROUP]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+
+    gpu.func @_QMbarPfoo2(%arg0: f32, %arg1: !fir.ref<!fir.array<100xf32>>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}, %arg4 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      memref.store %arg0, %arg4[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPfoo2(
+// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref<!fir.array<100xf32>>, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WG1:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}, %[[WG2:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WG1]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+// CHECK: memref.store %{{.*}}, %[[WG2]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+
+    gpu.func @_QMbarPprivate(%arg0: f32, %arg1: !fir.boxchar<1>) workgroup(%arg2 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) private(%arg3 : memref<1xf32, #gpu.address_space<private>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<private>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPprivate(
+// CHECK-SAME: %{{.*}}: f32, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) private(%[[PRIVATE:.*]] : memref<1xf32, #gpu.address_space<private>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+// CHECK: memref.store %{{.*}}, %[[PRIVATE]][%{{.*}}] : memref<1xf32, #gpu.address_space<private>>
+    
+    gpu.func @test_with_char_proc(%arg0: f32, %arg1: tuple<() -> (), i64> {fir.char_proc}) workgroup(%arg2 : memref<1xf32, #gpu.address_space<workgroup>>) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @test_with_char_proc(
+// CHECK-SAME: %{{.*}}: f32, %[[CHARPROC:.*]]: () -> () {fir.char_proc}, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space<workgroup>>) {
+// CHECK: %{{.*}} = fir.undefined tuple<() -> (), i64>
+// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[CHARPROC]], [0 : index] : (tuple<() -> (), i64>, () -> ()) -> tuple<() -> (), i64>
+// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[LENGTH]], [1 : index] : (tuple<() -> (), i64>, i64) -> tuple<() -> (), i64>
+// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+  }
+}
+
diff --git a/flang/test/Fir/assumed-size-ops-codegen.fir b/flang/test/Fir/assumed-size-ops-codegen.fir
new file mode 100644
index 0000000..54e9b3c
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-codegen.fir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" %s | FileCheck %s
+
+// CHECK-LABEL: @assumed_size_extent(
+// CHECK: %[[CNEG1:.*]] = llvm.mlir.constant(-1 : i64)
+// CHECK: llvm.return %[[CNEG1]] : i64
+func.func @assumed_size_extent() -> index {
+  %e = fir.assumed_size_extent : index
+  return %e : index
+}
+
+// CHECK-LABEL:  @is_assumed_size_extent(
+// CHECK: %[[NEG1:.*]] = llvm.mlir.constant(-1 : i64)
+// CHECK: %[[CMP:.*]] = llvm.icmp "eq"
+// CHECK: llvm.return %[[CMP]] : i1
+func.func @is_assumed_size_extent(%x: index) -> i1 {
+  %c = fir.is_assumed_size_extent %x : (index) -> i1
+  return %c : i1
+}
+
diff --git a/flang/test/Fir/assumed-size-ops-folding.fir b/flang/test/Fir/assumed-size-ops-folding.fir
new file mode 100644
index 0000000..9fd5fab
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-folding.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt --canonicalize %s | FileCheck %s
+
+// Verify: fir.is_assumed_size_extent(fir.assumed_size_extent) folds to i1 true.
+
+// CHECK-LABEL: func.func @fold(
+func.func @fold() -> i1 {
+  %e = fir.assumed_size_extent : index
+  // CHECK: %[[C:.*]] = arith.constant true
+  %t = fir.is_assumed_size_extent %e : (index) -> i1
+  return %t : i1
+}
+
+
diff --git a/flang/test/Fir/assumed-size-ops-roundtrip.fir b/flang/test/Fir/assumed-size-ops-roundtrip.fir
new file mode 100644
index 0000000..c3c1883
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-roundtrip.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+
+func.func @roundtrip() {
+  // CHECK: %[[E:.*]] = fir.assumed_size_extent : index
+  %e = fir.assumed_size_extent : index
+
+  // CHECK: %[[T:.*]] = fir.is_assumed_size_extent %[[E]] : (index) -> i1
+  %t = fir.is_assumed_size_extent %e : (index) -> i1
+
+  return
+}
+
+
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 5159c91..6d2beae 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -161,4 +161,5 @@ func.func @_QQmain() {
 // PASSES-NEXT:  LowerNontemporalPass
 // PASSES-NEXT: FIRToLLVMLowering
 // PASSES-NEXT: ReconcileUnrealizedCasts
+// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass
 // PASSES-NEXT: LLVMIRLoweringPass
diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90
index 42e9ed2..aaac98b 100644
--- a/flang/test/HLFIR/assumed-type-actual-args.f90
+++ b/flang/test/HLFIR/assumed-type-actual-args.f90
@@ -113,7 +113,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK:           %[[VAL_1:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_1:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>)
 ! CHECK:           fir.call @_QPs2(%[[VAL_3]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> ()
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index 9ecbb7c..ffb36fa 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -145,7 +145,7 @@ end subroutine
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : index
 ! CHECK:           %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
-! CHECK:           %[[VAL_6:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_6:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90
index 0f80c72..f1f968de 100644
--- a/flang/test/Lower/HLFIR/select-rank.f90
+++ b/flang/test/Lower/HLFIR/select-rank.f90
@@ -371,7 +371,7 @@ end subroutine
 ! CHECK:           fir.call @_QPr1(%[[VAL_11]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_12:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_12:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_13:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
@@ -435,7 +435,7 @@ end subroutine
 ! CHECK:           fir.call @_QPrdefault(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK:           cf.br ^bb5
 ! CHECK:         ^bb4:
-! CHECK:           %[[VAL_9:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_9:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_10:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_12:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
@@ -482,7 +482,7 @@ end subroutine
 ! CHECK:           fir.call @_QPr1_implicit(%[[VAL_21]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_22:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_22:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_23:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_25:.*]] = fir.shape %[[VAL_22]] : (index) -> !fir.shape<1>
@@ -534,7 +534,7 @@ end subroutine
 ! CHECK:           fir.call @_QPrc1_implicit(%[[VAL_26]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_27:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_27:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_28:.*]] = fir.box_addr %[[VAL_8]]#1 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> !fir.ref<!fir.array<*:!fir.char<1,?>>>
 ! CHECK:           %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.array<*:!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:           %[[VAL_30:.*]] = fir.shape %[[VAL_27]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/Intrinsics/lbound.f90 b/flang/test/Lower/Intrinsics/lbound.f90
index a5ca2d3..75c11ff 100644
--- a/flang/test/Lower/Intrinsics/lbound.f90
+++ b/flang/test/Lower/Intrinsics/lbound.f90
@@ -40,7 +40,7 @@ end subroutine
 subroutine lbound_test_3(a, dim, res)
   real, dimension(2:10, 3:*) :: a
   integer(8):: dim, res
-! CHECK:  %[[VAL_0:.*]] = arith.constant -1 : index
+! CHECK:  %[[VAL_0:.*]] = fir.assumed_size_extent : index
 ! CHECK:  %[[VAL_1:.*]] = fir.load %arg1 : !fir.ref<i64>
 ! CHECK:  %[[VAL_2:.*]] = fir.shape_shift %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index, index) -> !fir.shapeshift<2>
 ! CHECK:         %[[VAL_3:.*]] = fir.embox %arg0(%[[VAL_2]]) : (!fir.ref<!fir.array<9x?xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.array<9x?xf32>>
diff --git a/flang/test/Lower/Intrinsics/ubound.f90 b/flang/test/Lower/Intrinsics/ubound.f90
index dae21ac..bc8cff8 100644
--- a/flang/test/Lower/Intrinsics/ubound.f90
+++ b/flang/test/Lower/Intrinsics/ubound.f90
@@ -48,7 +48,7 @@ end subroutine
 subroutine ubound_test_3(a, dim, res)
   real, dimension(10, 20, *) :: a
   integer(8):: dim, res
-! CHECK:         %[[VAL_0:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_0:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_1:.*]] = fir.shape %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index) -> !fir.shape<3>
 ! CHECK:         %[[VAL_2:.*]] = fir.embox %{{.*}}(%[[VAL_1]]) : (!fir.ref<!fir.array<10x20x?xf32>>, !fir.shape<3>) -> !fir.box<!fir.array<10x20x?xf32>>
 ! CHECK:         %[[VAL_3:.*]] = fir.load %{{.*}} : !fir.ref<i64>
diff --git a/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90 b/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90
new file mode 100644
index 0000000..1ab883e
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-data-operands-remapping-common.f90
@@ -0,0 +1,43 @@
+! Test remapping of common blocks appearing in OpenACC data directives.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine test
+  real :: x(100), y(100), overlap1(100), overlap2(100)
+  equivalence (x(50), overlap1)
+  equivalence (x(40), overlap2)
+  common /comm/ x, y
+  !$acc declare link(/comm/)
+  !$acc parallel loop copyin(/comm/)
+    do i = 1, 100
+	   x(i) = overlap1(i)*2+ overlap2(i)
+    enddo
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest() {
+! CHECK:           %[[ADDRESS_OF_0:.*]] = fir.address_of(@comm_)
+! CHECK:           %[[COPYIN_0:.*]] = acc.copyin varPtr(%[[ADDRESS_OF_0]] : !fir.ref<!fir.array<800xi8>>) -> !fir.ref<!fir.array<800xi8>> {name = "comm"}
+! CHECK:           acc.parallel combined(loop) dataOperands(%[[COPYIN_0]] : !fir.ref<!fir.array<800xi8>>) {
+! CHECK:             %[[CONSTANT_8:.*]] = arith.constant 196 : index
+! CHECK:             %[[COORDINATE_OF_4:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref<!fir.array<800xi8>>, index) -> !fir.ref<i8>
+! CHECK:             %[[CONVERT_4:.*]] = fir.convert %[[COORDINATE_OF_4]] : (!fir.ref<i8>) -> !fir.ptr<!fir.array<100xf32>>
+! CHECK:             %[[SHAPE_4:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK:             %[[DECLARE_5:.*]]:2 = hlfir.declare %[[CONVERT_4]](%[[SHAPE_4]]) storage(%[[COPYIN_0]][196]) {uniq_name = "_QFtestEoverlap1"} : (!fir.ptr<!fir.array<100xf32>>, !fir.shape<1>, !fir.ref<!fir.array<800xi8>>) -> (!fir.ptr<!fir.array<100xf32>>, !fir.ptr<!fir.array<100xf32>>)
+! CHECK:             %[[CONSTANT_9:.*]] = arith.constant 156 : index
+! CHECK:             %[[COORDINATE_OF_5:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref<!fir.array<800xi8>>, index) -> !fir.ref<i8>
+! CHECK:             %[[CONVERT_5:.*]] = fir.convert %[[COORDINATE_OF_5]] : (!fir.ref<i8>) -> !fir.ptr<!fir.array<100xf32>>
+! CHECK:             %[[SHAPE_5:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK:             %[[DECLARE_6:.*]]:2 = hlfir.declare %[[CONVERT_5]](%[[SHAPE_5]]) storage(%[[COPYIN_0]][156]) {uniq_name = "_QFtestEoverlap2"} : (!fir.ptr<!fir.array<100xf32>>, !fir.shape<1>, !fir.ref<!fir.array<800xi8>>) -> (!fir.ptr<!fir.array<100xf32>>, !fir.ptr<!fir.array<100xf32>>)
+! CHECK:             %[[CONSTANT_10:.*]] = arith.constant 0 : index
+! CHECK:             %[[COORDINATE_OF_6:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref<!fir.array<800xi8>>, index) -> !fir.ref<i8>
+! CHECK:             %[[CONVERT_6:.*]] = fir.convert %[[COORDINATE_OF_6]] : (!fir.ref<i8>) -> !fir.ptr<!fir.array<100xf32>>
+! CHECK:             %[[SHAPE_6:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK:             %[[DECLARE_7:.*]]:2 = hlfir.declare %[[CONVERT_6]](%[[SHAPE_6]]) storage(%[[COPYIN_0]][0]) {uniq_name = "_QFtestEx"} : (!fir.ptr<!fir.array<100xf32>>, !fir.shape<1>, !fir.ref<!fir.array<800xi8>>) -> (!fir.ptr<!fir.array<100xf32>>, !fir.ptr<!fir.array<100xf32>>)
+! CHECK:             %[[CONSTANT_11:.*]] = arith.constant 400 : index
+! CHECK:             %[[COORDINATE_OF_7:.*]] = fir.coordinate_of %[[COPYIN_0]], %{{.*}} : (!fir.ref<!fir.array<800xi8>>, index) -> !fir.ref<i8>
+! CHECK:             %[[CONVERT_7:.*]] = fir.convert %[[COORDINATE_OF_7]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<100xf32>>
+! CHECK:             %[[SHAPE_7:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+! CHECK:             %[[DECLARE_8:.*]]:2 = hlfir.declare %[[CONVERT_7]](%[[SHAPE_7]]) storage(%[[COPYIN_0]][400]) {uniq_name = "_QFtestEy"} : (!fir.ref<!fir.array<100xf32>>, !fir.shape<1>, !fir.ref<!fir.array<800xi8>>) -> (!fir.ref<!fir.array<100xf32>>, !fir.ref<!fir.array<100xf32>>)
+! CHECK:             acc.loop combined(parallel)
+! CHECK:               %[[DESIGNATE_0:.*]] = hlfir.designate %[[DECLARE_5]]#0
+! CHECK:               %[[DESIGNATE_1:.*]] = hlfir.designate %[[DECLARE_6]]#0
+! CHECK:               %[[DESIGNATE_2:.*]] = hlfir.designate %[[DECLARE_7]]#0
diff --git a/flang/test/Lower/array-expression-assumed-size.f90 b/flang/test/Lower/array-expression-assumed-size.f90
index 2fbf315..a498148 100644
--- a/flang/test/Lower/array-expression-assumed-size.f90
+++ b/flang/test/Lower/array-expression-assumed-size.f90
@@ -19,7 +19,7 @@ end subroutine assumed_size_forall_test
 ! CHECK:         %[[VAL_1A:.*]] = fir.convert %c10{{.*}} : (i64) -> index 
 ! CHECK:         %[[VAL_1B:.*]] = arith.cmpi sgt, %[[VAL_1A]], %c0{{.*}} : index 
 ! CHECK:         %[[VAL_1:.*]] = arith.select %[[VAL_1B]], %[[VAL_1A]], %c0{{.*}} : index
-! CHECK:         %[[VAL_2:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_2:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_3:.*]] = arith.constant 1 : index
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 1 : i64
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
@@ -82,7 +82,7 @@ end subroutine assumed_size_forall_test
 ! CHECK:         %[[VAL_2A:.*]] = fir.convert %c10{{.*}} : (i64) -> index 
 ! CHECK:         %[[VAL_2B:.*]] = arith.cmpi sgt, %[[VAL_2A]], %c0{{.*}} : index 
 ! CHECK:         %[[VAL_2:.*]] = arith.select %[[VAL_2B]], %[[VAL_2A]], %c0{{.*}} : index
-! CHECK:         %[[VAL_3:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_3:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 6 : i32
@@ -149,7 +149,7 @@ end subroutine assumed_size_forall_test
 ! PostOpt-DAG:         %[[VAL_4:.*]] = arith.constant 0 : index
 ! PostOpt-DAG:         %[[VAL_5:.*]] = arith.constant 3 : index
 ! PostOpt-DAG:         %[[VAL_6:.*]] = arith.constant 4 : index
-! PostOpt-DAG:         %[[VAL_7:.*]] = arith.constant -1 : index
+! PostOpt-DAG:         %[[VAL_7:.*]] = fir.assumed_size_extent : index
 ! PostOpt:         %[[VAL_8:.*]] = fir.shape %[[VAL_1]], %[[VAL_7]] : (index, index) -> !fir.shape<2>
 ! PostOpt:         %[[VAL_9:.*]] = fir.slice %[[VAL_2]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]], %[[VAL_3]], %[[VAL_2]] : (index, index, index, index, index, index) -> !fir.slice<2>
 ! PostOpt:         %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_3]]
@@ -227,8 +227,8 @@ end subroutine assumed_size_forall_test
 ! PostOpt-DAG:         %[[VAL_4:.*]] = arith.constant 1 : index
 ! PostOpt-DAG:         %[[VAL_5:.*]] = arith.constant 0 : index
 ! PostOpt-DAG:         %[[VAL_6:.*]] = arith.constant 5 : index
-! PostOpt-DAG:         %[[VAL_8:.*]] = arith.constant -1 : index
 ! PostOpt:         %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"}
+! PostOpt:         %[[VAL_8:.*]] = fir.assumed_size_extent : index
 ! PostOpt:         %[[VAL_9:.*]] = fir.shape %[[VAL_2]], %[[VAL_8]] : (index, index) -> !fir.shape<2>
 ! PostOpt:         %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_4]]
 ! PostOpt:         br ^bb1(%[[VAL_5]], %[[VAL_4]] : index, index)
diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90
index 83d2d32..f1e535a 100644
--- a/flang/test/Lower/entry-statement.f90
+++ b/flang/test/Lower/entry-statement.f90
@@ -491,7 +491,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPentry_with_assumed_size(
 ! CHECK-SAME:                                          %[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK:           %[[VAL_2:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_2:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK:           cf.br ^bb1
diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-polymorphic.f90
new file mode 100644
index 0000000..2b7a51f
--- /dev/null
+++ b/flang/test/Lower/forall-polymorphic.f90
@@ -0,0 +1,89 @@
+! Test lowering of FORALL polymorphic pointer assignment
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+
+!! Test when LHS is polymorphic and RHS is not polymorphic
+! CHECK-LABEL: func.func @_QPforallpolymorphic(
+  subroutine forallPolymorphic()
+  TYPE :: DT
+    CLASS(DT), POINTER    :: Ptr(:) => NULL()
+  END TYPE
+
+  TYPE, EXTENDS(DT) :: DT1
+  END TYPE
+
+  TYPE(DT1), TARGET  :: Tar1(10)
+  CLASS(DT), POINTER :: T(:)
+  integer :: I
+
+  FORALL (I=1:10)
+    T(I)%Ptr => Tar1
+  END FORALL
+
+! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>> {bindc_name = "t", uniq_name = "_QFforallpolymorphicEt"}
+! CHECK: %[[V_15:[0-9]+]] = fir.declare %[[V_11]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFforallpolymorphicEt"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphicEtar1"}
+! CHECK: %[[V_17:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_18:[0-9]+]] = fir.declare %[[V_16]](%[[V_17]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphicEtar1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>
+! CHECK: %[[V_19:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_35:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg0 = %[[V_34]] to %[[V_35]] step %c1
+! CHECK: {
+! CHECK: %[[V_36:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_15]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_36]] : (i32) -> i64
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[V_39:[0-9]+]]:3 = fir.box_dims %[[V_37]], %[[C0]] : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>, index) -> (index, index, index)
+! CHECK: %[[V_40:[0-9]+]] = fir.shift %[[V_39]]#0 : (index) -> !fir.shift<1>
+! CHECK: %[[V_41:[0-9]+]] = fir.array_coor %[[V_37]](%[[V_40]]) %[[V_38]] : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>, !fir.shift<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>
+! CHECK: %[[V_42:[0-9]+]] = fir.embox %[[V_41]] source_box %[[V_37]] : (!fir.ref<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>, !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>) -> !fir.class<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>
+! CHECK: %[[V_43:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>
+! CHECK: %[[V_44:[0-9]+]] = fir.coordinate_of %[[V_42]], ptr : (!fir.class<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_45:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>>
+! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.box<!fir.ptr<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>>) -> !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>
+! CHECK: fir.store %[[V_46]] to %[[V_44]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: }
+
+  end subroutine forallPolymorphic
+
+!! Test when LHS is not polymorphic but RHS is polymorphic
+! CHECK-LABEL: func.func @_QPforallpolymorphic2(
+! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) {
+  subroutine forallPolymorphic2(Tar1)
+  TYPE :: DT
+    TYPE(DT), POINTER    :: Ptr(:) => NULL()
+  END TYPE
+
+  TYPE, EXTENDS(DT) :: DT1
+  END TYPE
+
+  CLASS(DT), ALLOCATABLE, TARGET  :: Tar1(:)
+  TYPE(DT) :: T(10)
+  integer :: I
+
+  FORALL (I=1:10)
+    T(I)%Ptr => Tar1
+  END FORALL
+
+! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>> {bindc_name = "t", uniq_name = "_QFforallpolymorphic2Et"}
+! CHECK: %[[V_12:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_13:[0-9]+]] = fir.declare %[[V_11]](%[[V_12]]) {uniq_name = "_QFforallpolymorphic2Et"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>
+! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_30:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_31:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg1 = %[[V_30]] to %[[V_31]] step %c1
+! CHECK: {
+! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg1 : (index) -> i32
+! CHECK: %[[V_33:[0-9]+]] = fir.convert %[[V_32]] : (i32) -> i64
+! CHECK: %[[V_34:[0-9]+]] = fir.array_coor %[[V_13]](%[[V_12]]) %[[V_33]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>
+! CHECK: %[[V_35:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>
+! CHECK: %[[V_36:[0-9]+]] = fir.coordinate_of %[[V_34]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_18]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_38:[0-9]+]]:3 = fir.box_dims %[[V_37]], %c0 : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>, index) -> (index, index, index)
+! CHECK: %[[V_39:[0-9]+]] = fir.shift %[[V_38]]#0 : (index) -> !fir.shift<1>
+! CHECK: %[[V_40:[0-9]+]] = fir.rebox %[[V_37]](%[[V_39]]) : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>, !fir.shift<1>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>
+! CHECK: fir.store %[[V_40]] to %[[V_36]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: }
+
+  end subroutine forallPolymorphic2
+
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index 1d99811..229fd4d 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -15,7 +15,7 @@ use omp_lib
     integer :: a, b
     real, dimension (:,:), allocatable :: darray
 
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+    !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
     !$omp allocate(y)
         print *, a
 
diff --git a/flang/test/Semantics/OpenMP/allocate04.f90 b/flang/test/Semantics/OpenMP/allocate04.f90
index bbd74eb..5fd75ba 100644
--- a/flang/test/Semantics/OpenMP/allocate04.f90
+++ b/flang/test/Semantics/OpenMP/allocate04.f90
@@ -14,16 +14,19 @@ use iso_c_binding
   type(c_ptr), pointer :: p
   integer :: x, y, z
 
-  associate (a => x)
-  !$omp allocate(x) allocator(omp_default_mem_alloc)
-
   !ERROR: PRIVATE clause is not allowed on the ALLOCATE directive
   !$omp allocate(y) private(y)
-  !ERROR: List item 'z' in ALLOCATE directive must not be a dummy argument
-  !$omp allocate(z)
-  !ERROR: List item 'p' in ALLOCATE directive must not have POINTER attribute
+  !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute
   !$omp allocate(p)
-  !ERROR: List item 'a' in ALLOCATE directive must not be an associate name
+
+  associate (a => x)
+  block
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
+  !$omp allocate(x) allocator(omp_default_mem_alloc)
+
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
+  !ERROR: A list item in a declarative ALLOCATE cannot be an associate name
   !$omp allocate(a)
+  end block
   end associate
 end subroutine allocate
diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90
index a787e8b..b5f7864 100644
--- a/flang/test/Semantics/OpenMP/allocate05.f90
+++ b/flang/test/Semantics/OpenMP/allocate05.f90
@@ -18,7 +18,7 @@ use omp_lib
   !$omp end target
 
   !$omp target
-      !ERROR: ALLOCATE directives that appear in a TARGET region must specify an allocator clause
+      !ERROR: An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified
       !$omp allocate
           allocate ( darray(a, b) )
   !$omp end target
diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90
index e14134c..9b57322 100644
--- a/flang/test/Semantics/OpenMP/allocate06.f90
+++ b/flang/test/Semantics/OpenMP/allocate06.f90
@@ -11,7 +11,7 @@ use omp_lib
   integer :: a, b, x
   real, dimension (:,:), allocatable :: darray
 
-  !ERROR: List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement
+  !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
 
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90
index 5bfa918..f4f11e2 100644
--- a/flang/test/Semantics/OpenMP/allocate08.f90
+++ b/flang/test/Semantics/OpenMP/allocate08.f90
@@ -3,14 +3,15 @@
 ! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
 ! OpenMP Version 5.0
 ! 2.11.3 allocate Directive
-! If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a
-! module, then only predefined memory allocator parameters can be used in the allocator clause
+! If list items within the ALLOCATE directive have the SAVE attribute, are a
+! common block name, or are declared in the scope of a module, then only
+! predefined memory allocator parameters can be used in the allocator clause
 
 module AllocateModule
   INTEGER :: z
 end module
 
-subroutine allocate()
+subroutine allocate(custom_allocator)
 use omp_lib
 use AllocateModule
   integer, SAVE :: x
@@ -18,30 +19,25 @@ use AllocateModule
   COMMON /CommonName/ y
 
   integer(kind=omp_allocator_handle_kind) :: custom_allocator
-  integer(kind=omp_memspace_handle_kind) :: memspace
-  type(omp_alloctrait), dimension(1) :: trait
-  memspace = omp_default_mem_space
-  trait(1)%key = fallback
-  trait(1)%value = default_mem_fb
-  custom_allocator = omp_init_allocator(memspace, 1, trait)
 
   !$omp allocate(x) allocator(omp_default_mem_alloc)
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y) allocator(omp_default_mem_alloc)
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z) allocator(omp_default_mem_alloc)
 
+  !ERROR: If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator
   !$omp allocate(x)
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y)
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z)
 
   !$omp allocate(w) allocator(custom_allocator)
 
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
   !$omp allocate(x) allocator(custom_allocator)
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y) allocator(custom_allocator)
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z) allocator(custom_allocator)
 end subroutine allocate
diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90
index c71c7ca..212e48f 100644
--- a/flang/test/Semantics/OpenMP/allocators04.f90
+++ b/flang/test/Semantics/OpenMP/allocators04.f90
@@ -22,12 +22,10 @@ subroutine allocate()
     trait(1)%value = default_mem_fb
     custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait)
 
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
     !$omp allocators allocate(omp_default_mem_alloc: a)
         allocate(a)
 
     !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
     !$omp allocators allocate(custom_allocator: b)
         allocate(b)
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/allocators06.f90 b/flang/test/Semantics/OpenMP/allocators06.f90
deleted file mode 100644
index 8e63512..0000000
--- a/flang/test/Semantics/OpenMP/allocators06.f90
+++ /dev/null
@@ -1,18 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50
-! OpenMP Version 5.2
-! Inherited from 2.11.3 allocate directive
-! The allocate directive must appear in the same scope as the declarations of
-! each of its list items and must follow all such declarations.
-
-subroutine allocate()
-    use omp_lib
-    integer, allocatable :: a
-contains
-    subroutine test()
-        !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
-        !$omp allocators allocate(omp_default_mem_alloc: a)
-            allocate(a)
-    end subroutine
-end subroutine
diff --git a/flang/test/Semantics/OpenMP/declarative-directive02.f90 b/flang/test/Semantics/OpenMP/declarative-directive02.f90
index dcde963..04b8c3d 100644
--- a/flang/test/Semantics/OpenMP/declarative-directive02.f90
+++ b/flang/test/Semantics/OpenMP/declarative-directive02.f90
@@ -9,7 +9,7 @@ subroutine test_decl
   implicit none
   save :: x1, y1
   !$omp threadprivate(x1)
-  !$omp allocate(y1)
+  !$omp allocate(y1) allocator(0)
   integer :: x1, y1
 
   ! OMPv5.2 7.7 declare-simd
@@ -33,12 +33,12 @@ end subroutine
 subroutine test_decl2
   save x1, y1
   !$omp threadprivate(x1)
-  !$omp allocate(y1)
+  !$omp allocate(y1) allocator(0)
   integer :: x1, y1
 
   ! implicit decl
   !$omp threadprivate(x2)
-  !$omp allocate(y2)
+  !$omp allocate(y2) allocator(0)
   save x2, y2
 end subroutine
 
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index 9178b0a..df6568d 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -36,6 +36,12 @@ module m
     if (i .le. N) a(i) = m(i)
   end subroutine
 
+  attributes(device) function devfct(r1, r2) result(res)
+    real(4), intent(in) :: r1(3), r2(3)
+    real(4) :: res(3)
+    res = r1 - r2 ! Do not error on function result
+  end function
+
   attributes(global) subroutine hostparameter(a)
     integer :: a(*)
     i = threadIdx%x
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index 32b0a1d..67d07ee 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -50,6 +50,7 @@ int main(int argc, char **argv) {
 #endif
   DialectRegistry registry;
   fir::support::registerDialects(registry);
+  registry.insert<mlir::memref::MemRefDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
       registry));
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 09f169b..a277690 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -776,6 +776,7 @@ add_header_macro(
   ../libc/include/nl_types.yaml
   nl_types.h
   DEPENDS
+    .llvm-libc-macros.nl_types_macros
     .llvm-libc-types.nl_catd
   )
 
diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt
index 76c03d9..b16337c 100644
--- a/libc/include/llvm-libc-macros/CMakeLists.txt
+++ b/libc/include/llvm-libc-macros/CMakeLists.txt
@@ -32,6 +32,12 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
 add_macro_header(
+  annex_k_macros
+  HDR
+    annex-k-macros.h
+)
+
+add_macro_header(
   assert_macros
   HDR
     assert-macros.h
@@ -346,6 +352,12 @@ add_macro_header(
 )
 
 add_macro_header(
+  nl_types_macros
+  HDR
+    nl-types-macros.h
+)
+
+add_macro_header(
   pthread_macros
   HDR
     pthread-macros.h
diff --git a/libc/include/llvm-libc-macros/annex-k-macros.h b/libc/include/llvm-libc-macros/annex-k-macros.h
new file mode 100644
index 0000000..7cfb5c1
--- /dev/null
+++ b/libc/include/llvm-libc-macros/annex-k-macros.h
@@ -0,0 +1,27 @@
+//===-- Definition of macros to be used with Annex K functions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H
+#define LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||              \
+    (defined(__cplusplus) && __cplusplus >= 201703L)
+
+// TODO(bassiounix): Who should define this macro (clang vs libc)? Where?
+// TODO(bassiounix): uncomment/move when Annex K is fully implemented.
+// #define __STDC_LIB_EXT1__ 201112L
+
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ == 1
+
+#define LIBC_HAS_ANNEX_K
+
+#endif // defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ == 1
+
+#endif // (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||
+       // (defined(__cplusplus) && __cplusplus >= 201703L)
+#endif // LLVM_LIBC_INCLUDE_LLVM_LIBC_MACROS_ANNEX_K_MACROS_H
diff --git a/libc/include/llvm-libc-macros/nl-types-macros.h b/libc/include/llvm-libc-macros/nl-types-macros.h
new file mode 100644
index 0000000..b6d0d35
--- /dev/null
+++ b/libc/include/llvm-libc-macros/nl-types-macros.h
@@ -0,0 +1,15 @@
+//===-- Definition of macros from nl_types.h ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_MACROS_NL_TYPES_MACROS_H
+#define LLVM_LIBC_MACROS_NL_TYPES_MACROS_H
+
+#define NL_SETD 1
+#define NL_CAT_LOCALE 1
+
+#endif // LLVM_LIBC_MACROS_NL_TYPES_MACROS_H
diff --git a/libc/include/nl_types.yaml b/libc/include/nl_types.yaml
index aecbb44..bdb59a8 100644
--- a/libc/include/nl_types.yaml
+++ b/libc/include/nl_types.yaml
@@ -1,7 +1,11 @@
 header: nl_types.h
 standards:
   - posix
-macros: []
+macros:
+  - macro_name: NL_SETD
+    macro_header: nl-types-macros.h
+  - macro_name: NL_CAT_LOCALE
+    macro_header: nl-types-macros.h
 types:
   - type_name: nl_catd
 enums: []
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 8178091..b8a0a74 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -4,6 +4,7 @@ macros:
   - macro_name: NULL
     macro_header: null-macro.h
 types:
+  - type_name: FILE
   - type_name: size_t
   - type_name: wint_t
   - type_name: wchar_t
@@ -104,9 +105,9 @@ functions:
   - name: wmemset
     standards:
       - stdc
-    return_type: wchar_t*
+    return_type: wchar_t *
     arguments:
-      - type: wchar_t*
+      - type: wchar_t *
       - type: wchar_t
       - type: size_t
   - name: wcschr
@@ -246,7 +247,7 @@ functions:
       - type: const wchar_t **__restrict
       - type: size_t
       - type: size_t
-      - type: mbstate_t
+      - type: mbstate_t *__restrict
   - name: wcsrtombs
     standards:
       - stdc
@@ -255,7 +256,7 @@ functions:
       - type: char *__restrict
       - type: const wchar_t **__restrict
       - type: size_t
-      - type: mbstate_t
+      - type: mbstate_t *__restrict
   - name: wcrtomb
     standards:
       - stdc
@@ -299,7 +300,7 @@ functions:
     arguments:
       - type: wchar_t *__restrict
       - type: const wchar_t *__restrict
-      - type: wchar_t** __restrict
+      - type: wchar_t **__restrict
   - name: wcpcpy
     standards:
       - stdc
diff --git a/libc/src/wchar/wcsnrtombs.cpp b/libc/src/wchar/wcsnrtombs.cpp
index 7f25b24..a344c23 100644
--- a/libc/src/wchar/wcsnrtombs.cpp
+++ b/libc/src/wchar/wcsnrtombs.cpp
@@ -22,7 +22,7 @@ namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(size_t, wcsnrtombs,
                    (char *__restrict s, const wchar_t **__restrict pwcs,
-                    size_t nwc, size_t len, mbstate_t *ps)) {
+                    size_t nwc, size_t len, mbstate_t *__restrict ps)) {
   LIBC_CRASH_ON_NULLPTR(pwcs);
   static internal::mbstate internal_mbstate;
   auto result = internal::wcsnrtombs(
diff --git a/libc/src/wchar/wcsnrtombs.h b/libc/src/wchar/wcsnrtombs.h
index bf8add7..2ca42c7 100644
--- a/libc/src/wchar/wcsnrtombs.h
+++ b/libc/src/wchar/wcsnrtombs.h
@@ -17,7 +17,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 size_t wcsnrtombs(char *__restrict s, const wchar_t **__restrict pwcs,
-                  size_t nwc, size_t len, mbstate_t *ps);
+                  size_t nwc, size_t len, mbstate_t *__restrict ps);
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/wchar/wcsrtombs.cpp b/libc/src/wchar/wcsrtombs.cpp
index 9d2508c..0167e85 100644
--- a/libc/src/wchar/wcsrtombs.cpp
+++ b/libc/src/wchar/wcsrtombs.cpp
@@ -22,7 +22,7 @@ namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(size_t, wcsrtombs,
                    (char *__restrict s, const wchar_t **__restrict pwcs,
-                    size_t n, mbstate_t *ps)) {
+                    size_t n, mbstate_t *__restrict ps)) {
   LIBC_CRASH_ON_NULLPTR(pwcs);
   static internal::mbstate internal_mbstate;
   auto result = internal::wcsnrtombs(
diff --git a/libc/src/wchar/wcsrtombs.h b/libc/src/wchar/wcsrtombs.h
index d23573f..b85e2c6 100644
--- a/libc/src/wchar/wcsrtombs.h
+++ b/libc/src/wchar/wcsrtombs.h
@@ -17,7 +17,7 @@
 namespace LIBC_NAMESPACE_DECL {
 
 size_t wcsrtombs(char *__restrict s, const wchar_t **__restrict pwcs, size_t n,
-                 mbstate_t *ps);
+                 mbstate_t *__restrict ps);
 
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/test/src/nl_types/CMakeLists.txt b/libc/test/src/nl_types/CMakeLists.txt
index 4fce637..6bafb32 100644
--- a/libc/test/src/nl_types/CMakeLists.txt
+++ b/libc/test/src/nl_types/CMakeLists.txt
@@ -7,6 +7,7 @@ add_libc_test(
   SRCS
     nl_types_test.cpp
   DEPENDS
+    libc.include.llvm-libc-macros.nl_types_macros
     libc.include.llvm-libc-types.nl_catd
     libc.src.nl_types.catopen
     libc.src.nl_types.catclose
diff --git a/libc/test/src/nl_types/nl_types_test.cpp b/libc/test/src/nl_types/nl_types_test.cpp
index 5ae5c5a..7392200 100644
--- a/libc/test/src/nl_types/nl_types_test.cpp
+++ b/libc/test/src/nl_types/nl_types_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/nl-types-macros.h"
 #include "include/llvm-libc-types/nl_catd.h"
 #include "src/nl_types/catclose.h"
 #include "src/nl_types/catgets.h"
@@ -15,7 +16,7 @@
 using LlvmLibcNlTypesTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
 TEST_F(LlvmLibcNlTypesTest, CatopenFails) {
-  ASSERT_EQ(LIBC_NAMESPACE::catopen("/somepath", 0),
+  ASSERT_EQ(LIBC_NAMESPACE::catopen("/somepath", NL_CAT_LOCALE),
             reinterpret_cast<nl_catd>(-1));
   ASSERT_ERRNO_EQ(EINVAL);
 }
@@ -28,6 +29,6 @@ TEST_F(LlvmLibcNlTypesTest, CatgetsFails) {
   const char *message = "message";
   // Note that we test for pointer equality here, since catgets
   // is expected to return the input argument as-is.
-  ASSERT_EQ(LIBC_NAMESPACE::catgets(nullptr, 0, 0, message),
+  ASSERT_EQ(LIBC_NAMESPACE::catgets(nullptr, NL_SETD, 1, message),
             const_cast<char *>(message));
 }
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index d8c2219..3228926 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -392,7 +392,7 @@ function(add_libclc_builtin_set)
   list( PREPEND bytecode_files ${bytecode_ir_files} )
 
   if( NOT bytecode_files )
-    message(FATAL_ERROR "Cannot create an empty builtins library")
+    message(FATAL_ERROR "Cannot create an empty builtins library for ${ARG_ARCH_SUFFIX}")
   endif()
 
   set( builtins_link_lib_tgt builtins.link.${ARG_ARCH_SUFFIX} )
diff --git a/libcxx/include/__cxx03/__algorithm/count.h b/libcxx/include/__cxx03/__algorithm/count.h
index 5440fd0..5b05b4b 100644
--- a/libcxx/include/__cxx03/__algorithm/count.h
+++ b/libcxx/include/__cxx03/__algorithm/count.h
@@ -54,18 +54,18 @@ __count_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n)
   if (__first.__ctz_ != 0) {
     __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
     __storage_type __dn    = std::min(__clz_f, __n);
-    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-    __r                    = std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
+    __storage_type __m     = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz_f - __dn));
+    __r                    = std::__libcpp_popcount<__storage_type>(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
     __n -= __dn;
     ++__first.__seg_;
   }
   // do middle whole words
   for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word)
-    __r += std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_));
+    __r += std::__libcpp_popcount<__storage_type>(std::__invert_if<!_ToCount>(*__first.__seg_));
   // do last partial word
   if (__n > 0) {
-    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
-    __r += std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
+    __storage_type __m = __storage_type(~0) >> (__bits_per_word - __n);
+    __r += std::__libcpp_popcount<__storage_type>(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
   }
   return __r;
 }
diff --git a/libcxx/include/__cxx03/__algorithm/for_each.h b/libcxx/include/__cxx03/__algorithm/for_each.h
index d160a9e..1ffb013 100644
--- a/libcxx/include/__cxx03/__algorithm/for_each.h
+++ b/libcxx/include/__cxx03/__algorithm/for_each.h
@@ -14,15 +14,11 @@
 #include <__cxx03/__config>
 #include <__cxx03/__iterator/segmented_iterator.h>
 #include <__cxx03/__type_traits/enable_if.h>
-#include <__cxx03/__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
-_LIBCPP_PUSH_MACROS
-#include <__cxx03/__undef_macros>
-
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _InputIterator, class _Function>
@@ -34,6 +30,4 @@ _LIBCPP_HIDE_FROM_ABI _Function for_each(_InputIterator __first, _InputIterator
 
 _LIBCPP_END_NAMESPACE_STD
 
-_LIBCPP_POP_MACROS
-
 #endif // _LIBCPP___CXX03___ALGORITHM_FOR_EACH_H
diff --git a/libcxx/include/__cxx03/__bit/popcount.h b/libcxx/include/__cxx03/__bit/popcount.h
index 64404d2..a61a921 100644
--- a/libcxx/include/__cxx03/__bit/popcount.h
+++ b/libcxx/include/__cxx03/__bit/popcount.h
@@ -6,9 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: __builtin_popcountg is available since Clang 19 and GCC 14. When support for older versions is dropped, we can
-//  refactor this code to exclusively use __builtin_popcountg.
-
 #ifndef _LIBCPP___CXX03___BIT_POPCOUNT_H
 #define _LIBCPP___CXX03___BIT_POPCOUNT_H
 
@@ -25,12 +22,9 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned __x) _NOEXCEPT { return __builtin_popcount(__x); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long __x) _NOEXCEPT { return __builtin_popcountl(__x); }
-
-inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long long __x) _NOEXCEPT {
-  return __builtin_popcountll(__x);
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(_Tp __v) {
+  return __builtin_popcountg(__v);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
index e696dcd..ffe3e0e 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
@@ -15,7 +15,6 @@
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp
index 3e4c5b1..c9a0fa1 100644
--- a/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.map/unord.map.cnstr/assign_copy.pass.cpp
@@ -14,8 +14,6 @@
 
 // unordered_map& operator=(const unordered_map& u);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
@@ -270,7 +268,7 @@ void test_alloc(const Alloc& lhs_alloc                   = Alloc(),
       V rhs_arr[] = {V(10, 4), V(13, 5), V(12, 324), V(0, 54), V(50, 5), V(2, 5)};
       Map copy(begin(rhs_arr), end(rhs_arr), 0, std::hash<int>(), std::equal_to<int>(), rhs_alloc);
       copy = orig;
-      LIBCPP_ASSERT(copy.bucket_count() == 5);
+      LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5);
       assert(copy.size() == 4);
       assert(copy.at(1) == 1);
       assert(copy.at(2) == 3);
diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp
index 938b6be..beb67d8 100644
--- a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.cnstr/assign_copy.pass.cpp
@@ -14,8 +14,6 @@
 
 // unordered_multimap& operator=(const unordered_multimap& u);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
@@ -289,7 +287,7 @@ void test_alloc(const Alloc& lhs_alloc                   = Alloc(),
       V rhs_arr[] = {V(10, 4), V(13, 5), V(12, 324), V(0, 54), V(50, 5), V(2, 5)};
       Map copy(begin(rhs_arr), end(rhs_arr), 0, std::hash<int>(), std::equal_to<int>(), rhs_alloc);
       copy = orig;
-      LIBCPP_ASSERT(copy.bucket_count() == 5);
+      LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5);
       assert(copy.size() == 4);
       assert(copy.find(1)->second == 1);
       assert(copy.find(2)->second == 3);
diff --git a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp
index e415253..5c85676 100644
--- a/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.multiset/unord.multiset.cnstr/assign_copy.pass.cpp
@@ -14,8 +14,6 @@
 
 // unordered_multiset& operator=(const unordered_multiset& u);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
@@ -259,7 +257,7 @@ void test_alloc(const Alloc& lhs_alloc                   = Alloc(),
       int rhs_arr[] = {10, 13, 12, 0, 50, 2};
       Set copy(std::begin(rhs_arr), std::end(rhs_arr), 0, std::hash<int>(), std::equal_to<int>(), rhs_alloc);
       copy = orig;
-      LIBCPP_ASSERT(copy.bucket_count() == 5);
+      LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5);
       assert(copy.size() == 4);
       assert(copy.count(1) == 1);
       assert(copy.count(2) == 1);
diff --git a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp
index 9828b8b..30e12e2 100644
--- a/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.set/unord.set.cnstr/assign_copy.pass.cpp
@@ -14,8 +14,6 @@
 
 // unordered_set& operator=(const unordered_set& u);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
@@ -262,7 +260,7 @@ void test_alloc(const Alloc& lhs_alloc                   = Alloc(),
       int rhs_arr[] = {10, 13, 12, 0, 50, 2};
       Set copy(std::begin(rhs_arr), std::end(rhs_arr), 0, std::hash<int>(), std::equal_to<int>(), rhs_alloc);
       copy = orig;
-      LIBCPP_ASSERT(copy.bucket_count() == 5);
+      LIBCPP_NON_FROZEN_ASSERT(copy.bucket_count() == 5);
       assert(copy.size() == 4);
       assert(copy.count(1) == 1);
       assert(copy.count(2) == 1);
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp
index 572a14e..fed5b4a 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp
@@ -12,8 +12,6 @@
 
 // iter_type put(iter_type s, ios_base& iob, char_type fill, void* v) const;
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <cassert>
 #include <ios>
 #include <locale>
@@ -36,7 +34,7 @@ int main(int, char**) {
     cpp17_output_iterator<char*> iter = f.put(cpp17_output_iterator<char*>(str), ios, '*', v);
     std::string ex(str, base(iter));
     assert(!ex.empty());
-    LIBCPP_ASSERT(ex == "0");
+    LIBCPP_NON_FROZEN_ASSERT(ex == "0");
   }
 
   return 0;
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 2fc25fc..c4e1600 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -264,6 +264,12 @@
 #define LIBCPP_ONLY(...) static_assert(true, "")
 #endif
 
+#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS
+#  define LIBCPP_NON_FROZEN_ASSERT(...) static_assert(true, "")
+#else
+#  define LIBCPP_NON_FROZEN_ASSERT(...) LIBCPP_ASSERT(__VA_ARGS__)
+#endif
+
 #if __has_cpp_attribute(nodiscard)
 #  define TEST_NODISCARD [[nodiscard]]
 #else
diff --git a/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll b/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll
index 7e29044..f57a3e3 100644
--- a/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll
+++ b/lld/test/COFF/Inputs/undefined-symbol-lto-a.ll
@@ -47,8 +47,8 @@ entry:
   ret void
 }
 
-attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { nounwind sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
 !llvm.linker.options = !{!1, !2}
diff --git a/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll b/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll
index 0f64e23..7347fde 100644
--- a/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll
+++ b/lld/test/COFF/Inputs/undefined-symbol-lto-b.ll
@@ -11,7 +11,7 @@ entry:
   ret void
 }
 
-attributes #0 = { norecurse nounwind readnone sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readnone sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 !llvm.linker.options = !{!0, !1}
 !llvm.module.flags = !{!2, !3, !4, !5}
diff --git a/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll
index 5f67302..61828cae 100644
--- a/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll
+++ b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll
@@ -13,8 +13,8 @@ declare dso_local i32 @"?foo@@YAHXZ"() #1
 
 declare dso_local i32 @"?bar@@YAHXZ"() #1
 
-attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/lld/test/MachO/lto-mattrs.ll b/lld/test/MachO/lto-mattrs.ll
index f658b48..4131329 100644
--- a/lld/test/MachO/lto-mattrs.ll
+++ b/lld/test/MachO/lto-mattrs.ll
@@ -33,4 +33,4 @@ define float @foo(float %x) #0 {
   ret float %div
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
+attributes #0 = { "reciprocal-estimates"="divf,vec-divf" }
diff --git a/lld/test/wasm/Inputs/debuginfo1.ll b/lld/test/wasm/Inputs/debuginfo1.ll
index d6db880..0a1c42a 100644
--- a/lld/test/wasm/Inputs/debuginfo1.ll
+++ b/lld/test/wasm/Inputs/debuginfo1.ll
@@ -35,9 +35,9 @@ declare void @foo(i32) local_unnamed_addr #2
 ; Function Attrs: nounwind readnone speculatable
 declare void @llvm.dbg.value(metadata, metadata, metadata) #3
 
-attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
 attributes #3 = { nounwind readnone speculatable }
 attributes #4 = { nounwind }
 
diff --git a/lld/test/wasm/Inputs/debuginfo2.ll b/lld/test/wasm/Inputs/debuginfo2.ll
index 1b63dd5..c832be5 100644
--- a/lld/test/wasm/Inputs/debuginfo2.ll
+++ b/lld/test/wasm/Inputs/debuginfo2.ll
@@ -31,7 +31,7 @@ entry:
 ; Function Attrs: nounwind readnone speculatable
 declare void @llvm.dbg.value(metadata, metadata, metadata) #1
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone speculatable }
 
 !llvm.dbg.cu = !{!2}
diff --git a/lld/test/wasm/debug-removed-fn.ll b/lld/test/wasm/debug-removed-fn.ll
index 8dae48a..20c3034 100644
--- a/lld/test/wasm/debug-removed-fn.ll
+++ b/lld/test/wasm/debug-removed-fn.ll
@@ -28,7 +28,7 @@ entry:
   ret i32 6, !dbg !13
 }
 
-attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index 6b121c9..9900745 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -754,7 +754,6 @@ ClangExpressionParser::ClangExpressionParser(
   // Make sure clang uses the same VFS as LLDB.
   m_compiler->setVirtualFileSystem(
       FileSystem::Instance().GetVirtualFileSystem());
-  m_compiler->createFileManager();
 
   // 2. Configure the compiler with a set of default options that are
   // appropriate for most situations.
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 91f3a6c..b4422a7 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -86,6 +86,7 @@
 #include "lldb/Host/Host.h"
 #include "lldb/Utility/StringExtractorGDBRemote.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -2762,6 +2763,133 @@ size_t ProcessGDBRemote::DoReadMemory(addr_t addr, void *buf, size_t size,
   return 0;
 }
 
+// Batches all of `ranges` into a single MultiMemRead packet when the remote
+// stub supports it. If the packet is unsupported, or sending or parsing it
+// fails (which is logged), falls back to Process::ReadMemoryRanges.
+llvm::SmallVector<llvm::MutableArrayRef<uint8_t>>
+ProcessGDBRemote::ReadMemoryRanges(
+    llvm::ArrayRef<Range<lldb::addr_t, size_t>> ranges,
+    llvm::MutableArrayRef<uint8_t> buffer) {
+  if (!m_gdb_comm.GetMultiMemReadSupported())
+    return Process::ReadMemoryRanges(ranges, buffer);
+
+  llvm::Expected<StringExtractorGDBRemote> response =
+      SendMultiMemReadPacket(ranges);
+  if (!response) {
+    LLDB_LOG_ERROR(GetLog(GDBRLog::Process), response.takeError(),
+                   "MultiMemRead error response: {0}");
+    return Process::ReadMemoryRanges(ranges, buffer);
+  }
+
+  llvm::StringRef response_str = response->GetStringRef();
+  const unsigned expected_num_ranges = ranges.size();
+  llvm::Expected<llvm::SmallVector<llvm::MutableArrayRef<uint8_t>>>
+      parsed_response =
+          ParseMultiMemReadPacket(response_str, buffer, expected_num_ranges);
+  if (!parsed_response) {
+    LLDB_LOG_ERROR(GetLog(GDBRLog::Process), parsed_response.takeError(),
+                   "MultiMemRead error parsing response: {0}");
+    return Process::ReadMemoryRanges(ranges, buffer);
+  }
+  return std::move(*parsed_response);
+}
+
+// Builds and sends "MultiMemRead:ranges:<addr>,<size>[,<addr>,<size>...];"
+// (hex, no 0x prefix) and returns the reply, or an error if the packet could
+// not be sent or the stub answered with anything but a normal response.
+llvm::Expected<StringExtractorGDBRemote>
+ProcessGDBRemote::SendMultiMemReadPacket(
+    llvm::ArrayRef<Range<lldb::addr_t, size_t>> ranges) {
+  std::string packet_str;
+  llvm::raw_string_ostream stream(packet_str);
+  stream << "MultiMemRead:ranges:";
+
+  auto range_to_stream = [&](auto range) {
+    // the "-" marker omits the '0x' prefix.
+    stream << llvm::formatv("{0:x-},{1:x-}", range.base, range.size);
+  };
+  llvm::interleave(ranges, stream, range_to_stream, ",");
+  stream << ";";
+
+  StringExtractorGDBRemote response;
+  GDBRemoteCommunication::PacketResult packet_result =
+      m_gdb_comm.SendPacketAndWaitForResponse(packet_str.data(), response,
+                                              GetInterruptTimeout());
+  if (packet_result != GDBRemoteCommunication::PacketResult::Success)
+    return llvm::createStringError(
+        llvm::formatv("MultiMemRead failed to send packet: '{0}'", packet_str));
+
+  if (response.IsErrorResponse())
+    return llvm::createStringError(
+        llvm::formatv("MultiMemRead failed: '{0}'", response.GetStringRef()));
+
+  if (!response.IsNormalResponse())
+    return llvm::createStringError(llvm::formatv(
+        "MultiMemRead unexpected response: '{0}'", response.GetStringRef()));
+
+  return response;
+}
+
+// Parses a "<size>[,<size>...];<data>" reply: each hex size gives the number
+// of bytes of <data> belonging to the next range. The bytes are copied
+// back-to-back into `buffer` and one sub-range of `buffer` is returned per
+// size. Returns an error if the reply is malformed, does not fit in
+// `buffer`, or does not describe exactly `expected_num_ranges` ranges.
+llvm::Expected<llvm::SmallVector<llvm::MutableArrayRef<uint8_t>>>
+ProcessGDBRemote::ParseMultiMemReadPacket(llvm::StringRef response_str,
+                                          llvm::MutableArrayRef<uint8_t> buffer,
+                                          unsigned expected_num_ranges) {
+  // The sizes and the data are separated by a `;`.
+  auto [sizes_str, memory_data] = response_str.split(';');
+  if (sizes_str.size() == response_str.size())
+    return llvm::createStringError(llvm::formatv(
+        "MultiMemRead response missing field separator ';' in: '{0}'",
+        response_str));
+
+  llvm::SmallVector<llvm::MutableArrayRef<uint8_t>> read_results;
+
+  // Sizes are separated by a `,`.
+  for (llvm::StringRef size_str : llvm::split(sizes_str, ',')) {
+    uint64_t read_size;
+    if (size_str.getAsInteger(16, read_size))
+      return llvm::createStringError(llvm::formatv(
+          "MultiMemRead response has invalid size string: {0}", size_str));
+
+    if (memory_data.size() < read_size)
+      return llvm::createStringError(
+          llvm::formatv("MultiMemRead response did not have enough data, "
+                        "requested sizes: {0}",
+                        sizes_str));
+
+    // The sizes come from the remote stub, so this has to be a runtime check:
+    // an assert is compiled out of release builds, and the memcpy below would
+    // then write past the end of the caller's buffer.
+    if (buffer.size() < read_size)
+      return llvm::createStringError(
+          llvm::formatv("MultiMemRead response has more data than fits in the "
+                        "destination buffer, requested sizes: {0}",
+                        sizes_str));
+
+    llvm::StringRef region_to_read = memory_data.take_front(read_size);
+    memory_data = memory_data.drop_front(read_size);
+
+    llvm::MutableArrayRef<uint8_t> region_to_write =
+        buffer.take_front(read_size);
+    buffer = buffer.drop_front(read_size);
+
+    memcpy(region_to_write.data(), region_to_read.data(), read_size);
+    read_results.push_back(region_to_write);
+  }
+
+  // The reply must describe exactly the number of ranges that were requested.
+  if (read_results.size() != expected_num_ranges)
+    return llvm::createStringError(llvm::formatv(
+        "MultiMemRead response has {0} ranges, expected {1}",
+        read_results.size(), expected_num_ranges));
+
+  return read_results;
+}
+
 bool ProcessGDBRemote::SupportsMemoryTagging() {
   return m_gdb_comm.GetMemoryTaggingSupported();
 }
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
index 7c3dfb1..eb33b52 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
@@ -137,6 +137,22 @@ public:
   size_t DoReadMemory(lldb::addr_t addr, void *buf, size_t size,
                       Status &error) override;
 
+  /// Override of ReadMemoryRanges that uses MultiMemRead to optimize this
+  /// operation.
+  llvm::SmallVector<llvm::MutableArrayRef<uint8_t>>
+  ReadMemoryRanges(llvm::ArrayRef<Range<lldb::addr_t, size_t>> ranges,
+                   llvm::MutableArrayRef<uint8_t> buf) override;
+
+private:
+  llvm::Expected<StringExtractorGDBRemote>
+  SendMultiMemReadPacket(llvm::ArrayRef<Range<lldb::addr_t, size_t>> ranges);
+
+  llvm::Expected<llvm::SmallVector<llvm::MutableArrayRef<uint8_t>>>
+  ParseMultiMemReadPacket(llvm::StringRef response_str,
+                          llvm::MutableArrayRef<uint8_t> buffer,
+                          unsigned expected_num_ranges);
+
+public:
   Status
   WriteObjectFile(std::vector<ObjectFile::LoadableData> entries) override;
 
diff --git a/lldb/test/API/CMakeLists.txt b/lldb/test/API/CMakeLists.txt
index e3bffbc..c719ac3 100644
--- a/lldb/test/API/CMakeLists.txt
+++ b/lldb/test/API/CMakeLists.txt
@@ -74,6 +74,16 @@ else()
   endif()
 endif()
 
+find_program(LLDB_DIRNAME_PATH dirname)
+if(LLDB_DIRNAME_PATH)
+  message(STATUS "Found dirname: ${LLDB_DIRNAME_PATH}")
+else()
+  message(STATUS "Could NOT find 'dirname'")
+  message(WARNING
+        "Many LLDB API tests require the GNU coreutils tools. Please make "
+        "sure they are installed and in PATH.")
+endif()
+
 if (TARGET clang)
   set(LLDB_DEFAULT_TEST_COMPILER "${LLVM_TOOLS_BINARY_DIR}/clang${CMAKE_EXECUTABLE_SUFFIX}")
 else()
diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py
index f5d3da53..c95a57f 100644
--- a/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py
+++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py
@@ -13,12 +13,31 @@ class FinishFromEmptyFunctionTestCase(TestBase):
 
     @skipIf(compiler="clang", compiler_version=['<', '17.0'])
     def test_finish_from_empty_function(self):
-        """Test that when stopped at a breakpoint in an empty function, finish leaves it correctly."""
+        """Test that when stopped at a breakpoint located at the last instruction
+        of a function, finish leaves it correctly."""
         self.build()
-        exe = self.getBuildArtifact("a.out")
-        target, process, thread, _ = lldbutil.run_to_name_breakpoint(
-            self, "done", exe_name=exe
+        target, _, thread, _ = lldbutil.run_to_source_breakpoint(
+            self, "// Set breakpoint here", lldb.SBFileSpec("main.c")
         )
+        # Find the address of the last instruction of 'done()' and set a breakpoint there.
+        # Even though 'done()' is empty, it may contain prologue and epilogue code, so
+        # simply setting a breakpoint at the function can place it before 'ret'.
+        error = lldb.SBError()
+        ret_bp_addr = lldb.SBAddress()
+        while True:
+            thread.StepInstruction(False, error)
+            self.assertTrue(error.Success())
+            frame = thread.GetSelectedFrame()
+            if "done" in frame.GetFunctionName():
+                ret_bp_addr = frame.GetPCAddress()
+            elif ret_bp_addr.IsValid():
+                # The entire function 'done()' has been stepped through, so 'ret_bp_addr'
+                # now contains the address of its last instruction, i.e. 'ret'.
+                break
+        ret_bp = target.BreakpointCreateByAddress(ret_bp_addr.GetLoadAddress(target))
+        self.assertTrue(ret_bp.IsValid())
+        # Resume the execution and hit the new breakpoint.
+        self.runCmd("cont")
         if self.TraceOn():
             self.runCmd("bt")
 
@@ -29,7 +48,6 @@ class FinishFromEmptyFunctionTestCase(TestBase):
         )
         self.assertTrue(safety_bp.IsValid())
 
-        error = lldb.SBError()
         thread.StepOut(error)
         self.assertTrue(error.Success())
 
diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c
index bc66a548..b3f90db5 100644
--- a/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c
+++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c
@@ -2,6 +2,7 @@
 void done() {}
 int main() {
   puts("in main");
+  done(); // Set breakpoint here
   done();
   puts("leaving main");
   return 0;
diff --git a/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py b/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py
index a14035d..a9fbe54 100644
--- a/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py
+++ b/lldb/test/API/lang/objc/foundation/TestObjCMethodsNSError.py
@@ -45,3 +45,30 @@ class FoundationTestCaseNSError(TestBase):
             ],
         )
         self.runCmd("process continue")
+
+    @skipIfOutOfTreeDebugserver
+    def test_runtime_types_efficient_memreads(self):
+        # Test that we use an efficient reading of memory when reading
+        # Objective-C method descriptions.
+        logfile = os.path.join(self.getBuildDir(), "log.txt")
+        self.runCmd(f"log enable -f {logfile} gdb-remote packets process")
+        self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets"))
+
+        self.build()
+        self.target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "// Break here for NSString tests", lldb.SBFileSpec("main.m", False)
+        )
+
+        self.runCmd(f"proc plugin packet send StartTesting", check=False)
+        self.expect('expression str = [NSString stringWithCString: "new"]')
+        self.runCmd(f"proc plugin packet send EndTesting", check=False)
+
+        self.assertTrue(os.path.exists(logfile))
+        log_text = open(logfile).read()
+        log_text = log_text.split("StartTesting", 1)[-1].split("EndTesting", 1)[0]
+
+        # This test is only checking that the packet is used at all (and that
+        # no errors are produced). It doesn't check that the packet is being
+        # used to solve a problem in an optimal way.
+        self.assertIn("MultiMemRead:", log_text)
+        self.assertNotIn("MultiMemRead error", log_text)
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 52c8c6b..3c4f225 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -26,6 +26,7 @@
 #include "lldb/API/SBEvent.h"
 #include "lldb/API/SBLanguageRuntime.h"
 #include "lldb/API/SBListener.h"
+#include "lldb/API/SBMutex.h"
 #include "lldb/API/SBProcess.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/Host/JSONTransport.h"
@@ -1452,7 +1453,11 @@ void DAP::EventThread() {
           const bool remove_module =
               event_mask & lldb::SBTarget::eBroadcastBitModulesUnloaded;
 
-          std::lock_guard<std::mutex> guard(modules_mutex);
+          // NOTE: Both mutexes must be acquired to prevent deadlock when
+          // handling `modules_request`, which also requires both locks.
+          lldb::SBMutex api_mutex = GetAPIMutex();
+          const std::scoped_lock<lldb::SBMutex, std::mutex> guard(
+              api_mutex, modules_mutex);
           for (uint32_t i = 0; i < num_modules; ++i) {
             lldb::SBModule module =
                 lldb::SBTarget::GetModuleAtIndexFromEvent(i, event);
diff --git a/lldb/unittests/Target/MemoryTest.cpp b/lldb/unittests/Target/MemoryTest.cpp
index f7b4e97..e444f68 100644
--- a/lldb/unittests/Target/MemoryTest.cpp
+++ b/lldb/unittests/Target/MemoryTest.cpp
@@ -245,7 +245,7 @@ public:
     if (read_more_than_requested)
       size *= 2;
     uint8_t *buffer = static_cast<uint8_t *>(buf);
-    for (size_t addr = vm_addr; addr < vm_addr + size; addr++)
+    for (lldb::addr_t addr = vm_addr; addr < vm_addr + size; addr++)
       buffer[addr - vm_addr] = static_cast<uint8_t>(addr); // LSB of addr.
     return size;
   }
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index 8677d89..63f6663 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -1692,29 +1692,29 @@ faraway places in the file to tell that the function is local:
 Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-When writing the body of an ``if``, ``else``, or for/while loop statement, we
-prefer to omit the braces to avoid unnecessary line noise. However, braces
-should be used in cases where the omission of braces harms the readability and
-maintainability of the code.
+When writing the body of an ``if``, ``else``, or ``for``/``while`` loop
+statement, we aim to reduce unnecessary line noise.
 
-We consider that readability is harmed when omitting the brace in the presence
-of a single statement that is accompanied by a comment (assuming the comment
-can't be hoisted above the ``if`` or loop statement, see below).
+**Omit braces when:**
 
-Similarly, braces should be used when a single-statement body is complex enough
-that it becomes difficult to see where the block containing the following
-statement began. An ``if``/``else`` chain or a loop is considered a single
-statement for this rule, and this rule applies recursively.
+*   The body consists of a single **simple** statement.
+*   The single statement is not preceded by a comment.
+    (Hoist comments above the control statement if you can.)
+*   An ``else`` clause, if present, also meets the above criteria (single
+    simple statement, no associated comments).
 
-This list is not exhaustive. For example, readability is also harmed if an
-``if``/``else`` chain does not use braced bodies for either all or none of its
-members, or has complex conditionals, deep nesting, etc. The examples below
-intend to provide some guidelines.
+**Use braces in all other cases, including:**
 
-Maintainability is harmed if the body of an ``if`` ends with a (directly or
-indirectly) nested ``if`` statement with no ``else``. Braces on the outer ``if``
-would help to avoid running into a "dangling else" situation.
+*   Multi-statement bodies
+*   Single-statement bodies with non-hoistable comments
+*   Complex single-statement bodies (e.g., deep nesting, complex nested
+    loops)
+*   Inconsistent bracing within ``if``/``else if``/``else`` chains (if one
+    block requires braces, all must)
+*   ``if`` statements ending with a nested ``if`` lacking an ``else`` (to
+    prevent "dangling else")
 
+The examples below provide guidelines for these cases:
 
 .. code-block:: c++
 
diff --git a/llvm/docs/CommandGuide/llc.rst b/llvm/docs/CommandGuide/llc.rst
index 900649f..cc670f6 100644
--- a/llvm/docs/CommandGuide/llc.rst
+++ b/llvm/docs/CommandGuide/llc.rst
@@ -125,13 +125,6 @@ End-user Options
 
  Enable setting the FP exceptions build attribute not to use exceptions.
 
-.. option:: --enable-unsafe-fp-math
-
- Enable optimizations that make unsafe assumptions about IEEE math (e.g. that
- addition is associative) or may not work for all input ranges.  These
- optimizations allow the code generator to make use of some instructions which
- would otherwise not be usable (such as ``fsin`` on X86).
-
 .. option:: --stats
 
  Print statistics recorded by code-generation passes.
diff --git a/llvm/docs/CommandGuide/lli.rst b/llvm/docs/CommandGuide/lli.rst
index 94c0013..8afe10d 100644
--- a/llvm/docs/CommandGuide/lli.rst
+++ b/llvm/docs/CommandGuide/lli.rst
@@ -107,11 +107,6 @@ FLOATING POINT OPTIONS
 
  Enable optimizations that assume no NAN values.
 
-.. option:: -enable-unsafe-fp-math
-
- Causes :program:`lli` to enable optimizations that may decrease floating point
- precision.
-
 .. option:: -soft-float
 
  Causes :program:`lli` to generate software floating point library calls instead of
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index b055327..661a115 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -504,7 +504,7 @@ undefined.
 G_ABDS, G_ABDU
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Compute the absolute difference (signed and unsigned), e.g. abs(x-y).
+Compute the absolute difference (signed and unsigned), e.g. trunc(abs(ext(x)-ext(y))).
 
 .. code-block:: none
 
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index f057b2d..12b5e3e 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -674,7 +674,7 @@ Compiled to LLVM, this function would be represented like this:
     ret void, !dbg !24
   }
 
-  attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone }
 
   !llvm.dbg.cu = !{!0}
diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h
index d464cbc..9555fad 100644
--- a/llvm/include/llvm/ADT/BitmaskEnum.h
+++ b/llvm/include/llvm/ADT/BitmaskEnum.h
@@ -106,7 +106,7 @@ struct is_bitmask_enum : std::false_type {};
 
 template <typename E>
 struct is_bitmask_enum<
-    E, std::enable_if_t<sizeof(E::LLVM_BITMASK_LARGEST_ENUMERATOR) >= 0>>
+    E, std::void_t<decltype(E::LLVM_BITMASK_LARGEST_ENUMERATOR)>>
     : std::true_type {};
 
 /// Trait class to determine bitmask enumeration largest bit.
@@ -114,7 +114,7 @@ template <typename E, typename Enable = void> struct largest_bitmask_enum_bit;
 
 template <typename E>
 struct largest_bitmask_enum_bit<
-    E, std::enable_if_t<sizeof(E::LLVM_BITMASK_LARGEST_ENUMERATOR) >= 0>> {
+    E, std::void_t<decltype(E::LLVM_BITMASK_LARGEST_ENUMERATOR)>> {
   using UnderlyingTy = std::underlying_type_t<E>;
   static constexpr UnderlyingTy value =
       static_cast<UnderlyingTy>(E::LLVM_BITMASK_LARGEST_ENUMERATOR);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 3d3ec14..04ea769 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -638,8 +638,12 @@ public:
   /// \p GEP The GEP. The indices contained in the GEP itself are ignored,
   /// instead we use IndexExprs.
   /// \p IndexExprs The expressions for the indices.
-  LLVM_ABI const SCEV *
-  getGEPExpr(GEPOperator *GEP, const SmallVectorImpl<const SCEV *> &IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(GEPOperator *GEP,
+                                  ArrayRef<const SCEV *> IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(const SCEV *BaseExpr,
+                                  ArrayRef<const SCEV *> IndexExprs,
+                                  Type *SrcElementTy,
+                                  GEPNoWrapFlags NW = GEPNoWrapFlags::none());
   LLVM_ABI const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW);
   LLVM_ABI const SCEV *getMinMaxExpr(SCEVTypes Kind,
                                      SmallVectorImpl<const SCEV *> &Operands);
diff --git a/llvm/include/llvm/AsmParser/AsmParserContext.h b/llvm/include/llvm/AsmParser/AsmParserContext.h
new file mode 100644
index 0000000..1a397486
--- /dev/null
+++ b/llvm/include/llvm/AsmParser/AsmParserContext.h
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_ASMPARSERCONTEXT_H
+#define LLVM_ASMPARSER_ASMPARSERCONTEXT_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/AsmParser/FileLoc.h"
+#include "llvm/IR/Value.h"
+#include <optional>
+
+namespace llvm {
+
+/// Registry of file location information for LLVM IR constructs.
+///
+/// This class provides access to the file location information
+/// for various LLVM IR constructs. Currently, it supports Function,
+/// BasicBlock and Instruction locations.
+///
+/// When available, it can answer queries about what is at a given
+/// file location, as well as where in a file a given IR construct
+/// is.
+///
+/// This information is optionally emitted by the LLParser while
+/// it reads LLVM textual IR.
+class AsmParserContext {
+  DenseMap<Function *, FileLocRange> Functions;
+  DenseMap<BasicBlock *, FileLocRange> Blocks;
+  DenseMap<Instruction *, FileLocRange> Instructions;
+
+public:
+  std::optional<FileLocRange> getFunctionLocation(const Function *) const;
+  std::optional<FileLocRange> getBlockLocation(const BasicBlock *) const;
+  std::optional<FileLocRange> getInstructionLocation(const Instruction *) const;
+  /// Get the function at the requested location range.
+  /// If no single function occupies the queried range, or the record is
+  /// missing, a nullptr is returned.
+  Function *getFunctionAtLocation(const FileLocRange &) const;
+  /// Get the function at the requested location.
+  /// If no function occupies the queried location, or the record is missing, a
+  /// nullptr is returned.
+  Function *getFunctionAtLocation(const FileLoc &) const;
+  /// Get the block at the requested location range.
+  /// If no single block occupies the queried range, or the record is missing, a
+  /// nullptr is returned.
+  BasicBlock *getBlockAtLocation(const FileLocRange &) const;
+  /// Get the block at the requested location.
+  /// If no block occupies the queried location, or the record is missing, a
+  /// nullptr is returned.
+  BasicBlock *getBlockAtLocation(const FileLoc &) const;
+  /// Get the instruction at the requested location range.
+  /// If no single instruction occupies the queried range, or the record is
+  /// missing, a nullptr is returned.
+  Instruction *getInstructionAtLocation(const FileLocRange &) const;
+  /// Get the instruction at the requested location.
+  /// If no instruction occupies the queried location, or the record is missing,
+  /// a nullptr is returned.
+  Instruction *getInstructionAtLocation(const FileLoc &) const;
+  bool addFunctionLocation(Function *, const FileLocRange &);
+  bool addBlockLocation(BasicBlock *, const FileLocRange &);
+  bool addInstructionLocation(Instruction *, const FileLocRange &);
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/AsmParser/FileLoc.h b/llvm/include/llvm/AsmParser/FileLoc.h
new file mode 100644
index 0000000..02c1849
--- /dev/null
+++ b/llvm/include/llvm/AsmParser/FileLoc.h
@@ -0,0 +1,56 @@
+//===-- FileLoc.h ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_FILELOC_H
+#define LLVM_ASMPARSER_FILELOC_H
+
+#include <cassert>
+#include <utility>
+
+namespace llvm {
+
+/// Struct holding Line:Column location
+struct FileLoc {
+  /// 0-based line number
+  unsigned Line;
+  /// 0-based column number
+  unsigned Col;
+
+  bool operator<=(const FileLoc &RHS) const {
+    return Line < RHS.Line || (Line == RHS.Line && Col <= RHS.Col);
+  }
+
+  bool operator<(const FileLoc &RHS) const {
+    return Line < RHS.Line || (Line == RHS.Line && Col < RHS.Col);
+  }
+
+  FileLoc(unsigned L, unsigned C) : Line(L), Col(C) {}
+  FileLoc(std::pair<unsigned, unsigned> LC) : Line(LC.first), Col(LC.second) {}
+};
+
+/// Struct holding a semiopen range [Start; End)
+struct FileLocRange {
+  FileLoc Start;
+  FileLoc End;
+
+  FileLocRange() : Start(0, 0), End(0, 0) {}
+
+  FileLocRange(FileLoc S, FileLoc E) : Start(S), End(E) {
+    assert(Start <= End);
+  }
+
+  bool contains(FileLoc L) const { return Start <= L && L < End; }
+
+  bool contains(FileLocRange LR) const {
+    return Start <= LR.Start && LR.End <= End;
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 501a7ae..0e379e5 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -13,22 +13,25 @@
 #ifndef LLVM_ASMPARSER_LLLEXER_H
 #define LLVM_ASMPARSER_LLLEXER_H
 
-#include "LLToken.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APSInt.h"
+#include "llvm/AsmParser/LLToken.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
 #include <string>
 
 namespace llvm {
   class Type;
   class SMDiagnostic;
-  class SourceMgr;
   class LLVMContext;
 
   class LLLexer {
     const char *CurPtr;
     StringRef CurBuf;
 
+    /// The end (exclusive) of the previous token.
+    const char *PrevTokEnd = nullptr;
+
     enum class ErrorPriority {
       None,   // No error message present.
       Parser, // Errors issued by parser.
@@ -62,9 +65,7 @@ namespace llvm {
     explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
                      LLVMContext &C);
 
-    lltok::Kind Lex() {
-      return CurKind = LexToken();
-    }
+    lltok::Kind Lex() { return CurKind = LexToken(); }
 
     typedef SMLoc LocTy;
     LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); }
@@ -79,6 +80,19 @@ namespace llvm {
       IgnoreColonInIdentifiers = val;
     }
 
+    /// Get the line, column position of the start of the current token,
+    /// zero-indexed
+    std::pair<unsigned, unsigned> getTokLineColumnPos() {
+      auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(TokStart));
+      return {LC.first - 1, LC.second - 1};
+    }
+    /// Get the line, column position of the end of the previous token,
+    /// zero-indexed exclusive
+    std::pair<unsigned, unsigned> getPrevTokEndLineColumnPos() {
+      auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(PrevTokEnd));
+      return {LC.first - 1, LC.second - 1};
+    }
+
     // This returns true as a convenience for the parser functions that return
     // true on error.
     bool ParseError(LocTy ErrorLoc, const Twine &Msg) {
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index c01de4a..9eb31d7 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -13,8 +13,9 @@
 #ifndef LLVM_ASMPARSER_LLPARSER_H
 #define LLVM_ASMPARSER_LLPARSER_H
 
-#include "LLLexer.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/AsmParser/AsmParserContext.h"
+#include "llvm/AsmParser/LLLexer.h"
 #include "llvm/AsmParser/NumberedValues.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Attributes.h"
@@ -177,6 +178,9 @@ namespace llvm {
     // Map of module ID to path.
     std::map<unsigned, StringRef> ModuleIdMap;
 
+    /// Records source locations of Functions, BasicBlocks, and Instructions.
+    AsmParserContext *ParserContext;
+
     /// Only the llvm-as tool may set this to false to bypass
     /// UpgradeDebuginfo so it can generate broken bitcode.
     bool UpgradeDebugInfo;
@@ -189,10 +193,11 @@ namespace llvm {
   public:
     LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
              ModuleSummaryIndex *Index, LLVMContext &Context,
-             SlotMapping *Slots = nullptr)
+             SlotMapping *Slots = nullptr,
+             AsmParserContext *ParserContext = nullptr)
         : Context(Context), OPLex(F, SM, Err, Context),
           Lex(F, SM, Err, Context), M(M), Index(Index), Slots(Slots),
-          BlockAddressPFS(nullptr) {}
+          BlockAddressPFS(nullptr), ParserContext(ParserContext) {}
     bool Run(
         bool UpgradeDebugInfo,
         DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h
index c900b79..22b0881 100644
--- a/llvm/include/llvm/AsmParser/Parser.h
+++ b/llvm/include/llvm/AsmParser/Parser.h
@@ -15,6 +15,7 @@
 
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
 #include "llvm/Support/Compiler.h"
 #include <memory>
 #include <optional>
@@ -62,7 +63,8 @@ parseAssemblyFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
 ///              parsing.
 LLVM_ABI std::unique_ptr<Module>
 parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
-                    LLVMContext &Context, SlotMapping *Slots = nullptr);
+                    LLVMContext &Context, SlotMapping *Slots = nullptr,
+                    AsmParserContext *ParserContext = nullptr);
 
 /// Holds the Module and ModuleSummaryIndex returned by the interfaces
 /// that parse both.
@@ -128,9 +130,9 @@ parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err);
 LLVM_ABI std::unique_ptr<Module> parseAssembly(
     MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
     SlotMapping *Slots = nullptr,
-    DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
-      return std::nullopt;
-    });
+    DataLayoutCallbackTy DataLayoutCallback =
+        [](StringRef, StringRef) { return std::nullopt; },
+    AsmParserContext *ParserContext = nullptr);
 
 /// Parse LLVM Assembly including the summary index from a MemoryBuffer.
 ///
@@ -169,9 +171,9 @@ parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err);
 LLVM_ABI bool parseAssemblyInto(
     MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index, SMDiagnostic &Err,
     SlotMapping *Slots = nullptr,
-    DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
-      return std::nullopt;
-    });
+    DataLayoutCallbackTy DataLayoutCallback =
+        [](StringRef, StringRef) { return std::nullopt; },
+    AsmParserContext *ParserContext = nullptr);
 
 /// Parse a type and a constant value in the given string.
 ///
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4f27d9f1..76b6c8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -366,7 +366,7 @@ private:
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
-  virtual ~BasicTTIImplBase() = default;
+  ~BasicTTIImplBase() override = default;
 
   using TargetTransformInfoImplBase::DL;
 
@@ -821,13 +821,13 @@ public:
         SimplifyAndSetOp);
   }
 
-  virtual std::optional<unsigned>
+  std::optional<unsigned>
   getCacheSize(TargetTransformInfo::CacheLevel Level) const override {
     return std::optional<unsigned>(
         getST()->getCacheSize(static_cast<unsigned>(Level)));
   }
 
-  virtual std::optional<unsigned>
+  std::optional<unsigned>
   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override {
     std::optional<unsigned> TargetResult =
         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
@@ -838,31 +838,31 @@ public:
     return BaseT::getCacheAssociativity(Level);
   }
 
-  virtual unsigned getCacheLineSize() const override {
+  unsigned getCacheLineSize() const override {
     return getST()->getCacheLineSize();
   }
 
-  virtual unsigned getPrefetchDistance() const override {
+  unsigned getPrefetchDistance() const override {
     return getST()->getPrefetchDistance();
   }
 
-  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
-                                        unsigned NumStridedMemAccesses,
-                                        unsigned NumPrefetches,
-                                        bool HasCall) const override {
+  unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+                                unsigned NumStridedMemAccesses,
+                                unsigned NumPrefetches,
+                                bool HasCall) const override {
     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                          NumPrefetches, HasCall);
   }
 
-  virtual unsigned getMaxPrefetchIterationsAhead() const override {
+  unsigned getMaxPrefetchIterationsAhead() const override {
     return getST()->getMaxPrefetchIterationsAhead();
   }
 
-  virtual bool enableWritePrefetching() const override {
+  bool enableWritePrefetching() const override {
     return getST()->enableWritePrefetching();
   }
 
-  virtual bool shouldPrefetchAddressSpace(unsigned AS) const override {
+  bool shouldPrefetchAddressSpace(unsigned AS) const override {
     return getST()->shouldPrefetchAddressSpace(AS);
   }
 
diff --git a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
index 3950b95..7a6feda 100644
--- a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
+++ b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
@@ -42,7 +42,7 @@ public:
 
   /// Create a pass configuration object to be used by addPassToEmitX methods
   /// for generating a pipeline of CodeGen passes.
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   /// Add passes to the specified pass manager to get the specified file
   /// emitted.  Typically this will involve several steps of code generation.
diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
index 39c5a8d..af66f2d 100644
--- a/llvm/include/llvm/CodeGen/CommandFlags.h
+++ b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -58,8 +58,6 @@ LLVM_ABI CodeGenFileType getFileType();
 
 LLVM_ABI FramePointerKind getFramePointerUsage();
 
-LLVM_ABI bool getEnableUnsafeFPMath();
-
 LLVM_ABI bool getEnableNoInfsFPMath();
 
 LLVM_ABI bool getEnableNoNaNsFPMath();
diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
index fee4bb1..e72801b 100644
--- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h
+++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
@@ -118,7 +118,7 @@ private:
 
   // AsmPrinterHandler overrides.
 public:
-  virtual ~DebugHandlerBase() override;
+  ~DebugHandlerBase() override;
 
   void beginModule(Module *M) override;
 
diff --git a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
index bc8dc1b..6da10d8 100644
--- a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
+++ b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
@@ -44,12 +44,11 @@ private:
                                                  StringRef FuncOrModName);
   /// Override base class method to run on an llvm::MachineFunction
   /// specifically.
-  virtual void
-  visitEveryInstruction(unsigned &DroppedCount,
-                        DenseMap<VarID, DILocation *> &InlinedAtsMap,
-                        VarID Var) override;
+  void visitEveryInstruction(unsigned &DroppedCount,
+                             DenseMap<VarID, DILocation *> &InlinedAtsMap,
+                             VarID Var) override;
   /// Override base class method to run on DBG_VALUEs specifically.
-  virtual void visitEveryDebugRecord(
+  void visitEveryDebugRecord(
       DenseSet<VarID> &VarIDSet,
       DenseMap<StringRef, DenseMap<VarID, DILocation *>> &InlinedAtsMap,
       StringRef FuncName, bool Before) override;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
index ea3f1a8..6701ae0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
@@ -40,14 +40,14 @@ public:
 // A CSE config for fully optimized builds.
 class LLVM_ABI CSEConfigFull : public CSEConfigBase {
 public:
-  virtual ~CSEConfigFull() = default;
+  ~CSEConfigFull() override = default;
   bool shouldCSEOpc(unsigned Opc) override;
 };
 
 // Commonly used for O0 config.
 class LLVM_ABI CSEConfigConstantOnly : public CSEConfigBase {
 public:
-  virtual ~CSEConfigConstantOnly() = default;
+  ~CSEConfigConstantOnly() override = default;
   bool shouldCSEOpc(unsigned Opc) override;
 };
 
@@ -118,7 +118,7 @@ class LLVM_ABI GISelCSEInfo : public GISelChangeObserver {
 public:
   GISelCSEInfo() = default;
 
-  virtual ~GISelCSEInfo();
+  ~GISelCSEInfo() override;
 
   void setMF(MachineFunction &MF);
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
index 39ff90c..7a313f4 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
@@ -60,7 +60,7 @@ public:
   Combiner(MachineFunction &MF, CombinerInfo &CInfo,
            const TargetPassConfig *TPC, GISelValueTracking *VT,
            GISelCSEInfo *CSEInfo = nullptr);
-  virtual ~Combiner();
+  ~Combiner() override;
 
   virtual bool tryCombineAll(MachineInstr &I) const = 0;
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
index 2db66ba..17d656a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
@@ -58,7 +58,7 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver {
 
 public:
   GISelValueTracking(MachineFunction &MF, unsigned MaxDepth = 6);
-  ~GISelValueTracking() = default;
+  ~GISelValueTracking() override = default;
 
   const MachineFunction &getMachineFunction() const { return MF; }
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 3d7ccd5..268025e7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -656,7 +656,7 @@ private:
       IRT->addSuccessorWithProb(Src, Dst, Prob);
     }
 
-    virtual ~GISelSwitchLowering() = default;
+    ~GISelSwitchLowering() override = default;
 
   private:
     IRTranslator *IRT;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index cf65f34..5694079 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -21,7 +21,7 @@ class GISelObserverWrapper;
 
 class LLVM_ABI InstructionSelector : public GIMatchTableExecutor {
 public:
-  virtual ~InstructionSelector();
+  ~InstructionSelector() override;
 
   /// Select the (possibly generic) instruction \p I to only use target-specific
   /// opcodes. It is OK to insert multiple instructions, but they cannot be
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index b7ccfbb..8db99ba 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -210,8 +210,8 @@ struct SpecificConstantMatch {
 };
 
 /// Matches a constant equal to \p RequestedValue.
-inline SpecificConstantMatch m_SpecificICst(APInt RequestedValue) {
-  return SpecificConstantMatch(std::move(RequestedValue));
+inline SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue) {
+  return SpecificConstantMatch(RequestedValue);
 }
 
 inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) {
@@ -221,7 +221,7 @@ inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) {
 /// Matcher for a specific constant splat.
 struct SpecificConstantSplatMatch {
   APInt RequestedVal;
-  SpecificConstantSplatMatch(const APInt RequestedVal)
+  SpecificConstantSplatMatch(const APInt &RequestedVal)
       : RequestedVal(RequestedVal) {}
   bool match(const MachineRegisterInfo &MRI, Register Reg) {
     return isBuildVectorConstantSplat(Reg, MRI, RequestedVal,
@@ -230,8 +230,9 @@ struct SpecificConstantSplatMatch {
 };
 
 /// Matches a constant splat of \p RequestedValue.
-inline SpecificConstantSplatMatch m_SpecificICstSplat(APInt RequestedValue) {
-  return SpecificConstantSplatMatch(std::move(RequestedValue));
+inline SpecificConstantSplatMatch
+m_SpecificICstSplat(const APInt &RequestedValue) {
+  return SpecificConstantSplatMatch(RequestedValue);
 }
 
 inline SpecificConstantSplatMatch m_SpecificICstSplat(int64_t RequestedValue) {
@@ -242,7 +243,7 @@ inline SpecificConstantSplatMatch m_SpecificICstSplat(int64_t RequestedValue) {
 /// Matcher for a specific constant or constant splat.
 struct SpecificConstantOrSplatMatch {
   APInt RequestedVal;
-  SpecificConstantOrSplatMatch(const APInt RequestedVal)
+  SpecificConstantOrSplatMatch(const APInt &RequestedVal)
       : RequestedVal(RequestedVal) {}
   bool match(const MachineRegisterInfo &MRI, Register Reg) {
     APInt MatchedVal;
@@ -263,8 +264,8 @@ struct SpecificConstantOrSplatMatch {
 /// Matches a \p RequestedValue constant or a constant splat of \p
 /// RequestedValue.
 inline SpecificConstantOrSplatMatch
-m_SpecificICstOrSplat(APInt RequestedValue) {
-  return SpecificConstantOrSplatMatch(std::move(RequestedValue));
+m_SpecificICstOrSplat(const APInt &RequestedValue) {
+  return SpecificConstantOrSplatMatch(RequestedValue);
 }
 
 inline SpecificConstantOrSplatMatch
diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h
index f6b0571..4bcbad7 100644
--- a/llvm/include/llvm/CodeGen/MIR2Vec.h
+++ b/llvm/include/llvm/CodeGen/MIR2Vec.h
@@ -35,6 +35,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -61,7 +63,7 @@ class MIREmbedder;
 class SymbolicMIREmbedder;
 
 extern llvm::cl::OptionCategory MIR2VecCategory;
-extern cl::opt<float> OpcWeight;
+extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight;
 
 using Embedding = ir2vec::Embedding;
 using MachineInstEmbeddingsMap = DenseMap<const MachineInstr *, Embedding>;
@@ -74,31 +76,114 @@ class MIRVocabulary {
   friend class llvm::MIR2VecVocabLegacyAnalysis;
   using VocabMap = std::map<std::string, ir2vec::Embedding>;
 
-private:
-  // Define vocabulary layout - adapted for MIR
+  // MIRVocabulary Layout:
+  // +-------------------+-----------------------------------------------------+
+  // | Entity Type       | Description                                         |
+  // +-------------------+-----------------------------------------------------+
+  // | 1. Opcodes        | Target specific opcodes derived from TII, grouped   |
+  // |                   | by instruction semantics.                           |
+  // | 2. Common Operands| All common operand types, except register operands, |
+  // |                   | defined by MachineOperand::MachineOperandType enum. |
+  // | 3. Physical       | Register classes defined by the target, specialized |
+  // |    Reg classes    | by physical registers.                              |
+  // | 4. Virtual        | Register classes defined by the target, specialized |
+  // |    Reg classes    | by virtual and physical registers.                  |
+  // +-------------------+-----------------------------------------------------+
+
+  /// Layout information for the MIR vocabulary. Defines the starting index
+  /// and size of each section in the vocabulary.
   struct {
     size_t OpcodeBase = 0;
-    size_t OperandBase = 0;
+    size_t CommonOperandBase = 0;
+    size_t PhyRegBase = 0;
+    size_t VirtRegBase = 0;
     size_t TotalEntries = 0;
   } Layout;
 
-  enum class Section : unsigned { Opcodes = 0, MaxSections };
+  enum class Section : unsigned {
+    Opcodes = 0,
+    CommonOperands = 1,
+    PhyRegisters = 2,
+    VirtRegisters = 3,
+    MaxSections
+  };
 
   ir2vec::VocabStorage Storage;
-  mutable std::set<std::string> UniqueBaseOpcodeNames;
+  std::set<std::string> UniqueBaseOpcodeNames;
+  SmallVector<std::string, 24> RegisterOperandNames;
+
+  // Some instructions have optional register operands that may be NoRegister.
+  // We return a zero vector in such cases.
+  Embedding ZeroEmbedding;
+
+  // We have specialized MO_Register handling in the Register operand section,
+  // so we don't include it here. Also, no MO_DbgInstrRef for now.
+  static constexpr StringLiteral CommonOperandNames[] = {
+      "Immediate",       "CImmediate",        "FPImmediate",  "MBB",
+      "FrameIndex",      "ConstantPoolIndex", "TargetIndex",  "JumpTableIndex",
+      "ExternalSymbol",  "GlobalAddress",     "BlockAddress", "RegisterMask",
+      "RegisterLiveOut", "Metadata",          "MCSymbol",     "CFIIndex",
+      "IntrinsicID",     "Predicate",         "ShuffleMask"};
+  static_assert(std::size(CommonOperandNames) == MachineOperand::MO_Last - 1,
+                "Common operand names size changed, update accordingly");
+
   const TargetInstrInfo &TII;
-  void generateStorage(const VocabMap &OpcodeMap);
+  const TargetRegisterInfo &TRI;
+  const MachineRegisterInfo &MRI;
+
+  void generateStorage(const VocabMap &OpcodeMap,
+                       const VocabMap &CommonOperandMap,
+                       const VocabMap &PhyRegMap, const VocabMap &VirtRegMap);
   void buildCanonicalOpcodeMapping();
+  void buildRegisterOperandMapping();
 
   /// Get canonical index for a machine opcode
   unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
 
+  /// Get index for a common (non-register) machine operand
+  unsigned
+  getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const;
+
+  /// Get index for a register machine operand
+  unsigned getRegisterOperandIndex(Register Reg) const;
+
+  // Embedding lookup for a common (non-register) operand type.
+  const Embedding &
+  operator[](MachineOperand::MachineOperandType OperandType) const {
+    constexpr auto SectionIdx = static_cast<unsigned>(Section::CommonOperands);
+    return Storage[SectionIdx][getCommonOperandIndex(OperandType)];
+  }
+
+  const Embedding &operator[](Register Reg) const {
+    // Optional register operands may be NoRegister (0); those map to the
+    // zero vector.
+    // TODO: Implement proper stack slot handling for MIR2Vec embeddings.
+    // Stack slots represent frame indices and should get their own
+    // embedding strategy instead of the zero vector returned for now.
+    // Consider: 1) Separate vocabulary section for stack slots
+    //          2) Stack slot size/alignment based embeddings
+    //          3) Frame index based categorization
+    if (!Reg.isValid() || Reg.isStack())
+      return ZeroEmbedding;
+
+    // Physical and virtual registers are stored in separate sections.
+    const Section RegSection =
+        Reg.isPhysical() ? Section::PhyRegisters : Section::VirtRegisters;
+    const unsigned SectionIdx = static_cast<unsigned>(RegSection);
+    const unsigned EntryIdx = getRegisterOperandIndex(Reg);
+    return Storage[SectionIdx][EntryIdx];
+  }
+
 public:
   /// Static method for extracting base opcode names (public for testing)
   static std::string extractBaseOpcodeName(StringRef InstrName);
 
-  /// Get canonical index for base name (public for testing)
+  /// Get indices from opcode or operand names. These are public for testing.
+  /// String based lookups are inefficient and should be avoided in general.
   unsigned getCanonicalIndexForBaseName(StringRef BaseName) const;
+  unsigned getCanonicalIndexForOperandName(StringRef OperandName) const;
+  unsigned getCanonicalIndexForRegisterClass(StringRef RegName,
+                                             bool IsPhysical = true) const;
 
   /// Get the string key for a vocabulary entry at the given position
   std::string getStringKey(unsigned Pos) const;
@@ -111,6 +196,14 @@ public:
     return Storage[static_cast<unsigned>(Section::Opcodes)][LocalIndex];
   }
 
+  /// Embedding for \p Operand: register operands go through the Register
+  /// overload, every other operand is looked up by its operand type.
+  const Embedding &operator[](const MachineOperand &Operand) const {
+    if (Operand.getType() == MachineOperand::MO_Register)
+      return operator[](Operand.getReg());
+    return operator[](Operand.getType());
+  }
+
   // Iterator access
   using const_iterator = ir2vec::VocabStorage::const_iterator;
   const_iterator begin() const { return Storage.begin(); }
@@ -120,18 +213,25 @@ public:
   MIRVocabulary() = delete;
 
   /// Factory method to create MIRVocabulary from vocabulary map
-  static Expected<MIRVocabulary> create(VocabMap &&Entries,
-                                        const TargetInstrInfo &TII);
+  static Expected<MIRVocabulary>
+  create(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap, VocabMap &&PhyRegMap,
+         VocabMap &&VirtRegMap, const TargetInstrInfo &TII,
+         const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI);
 
   /// Create a dummy vocabulary for testing purposes.
   static Expected<MIRVocabulary>
-  createDummyVocabForTest(const TargetInstrInfo &TII, unsigned Dim = 1);
+  createDummyVocabForTest(const TargetInstrInfo &TII,
+                          const TargetRegisterInfo &TRI,
+                          const MachineRegisterInfo &MRI, unsigned Dim = 1);
 
   /// Total number of entries in the vocabulary
   size_t getCanonicalSize() const { return Storage.size(); }
 
 private:
-  MIRVocabulary(VocabMap &&Entries, const TargetInstrInfo &TII);
+  MIRVocabulary(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap,
+                VocabMap &&PhyRegMap, VocabMap &&VirtRegMap,
+                const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+                const MachineRegisterInfo &MRI);
 };
 
 /// Base class for MIR embedders
@@ -144,11 +244,13 @@ protected:
   const unsigned Dimension;
 
   /// Weight for opcode embeddings
-  const float OpcWeight;
+  const float OpcWeight, CommonOperandWeight, RegOperandWeight;
 
   MIREmbedder(const MachineFunction &MF, const MIRVocabulary &Vocab)
       : MF(MF), Vocab(Vocab), Dimension(Vocab.getDimension()),
-        OpcWeight(mir2vec::OpcWeight) {}
+        OpcWeight(mir2vec::OpcWeight),
+        CommonOperandWeight(mir2vec::CommonOperandWeight),
+        RegOperandWeight(mir2vec::RegOperandWeight) {}
 
   /// Function to compute embeddings.
   Embedding computeEmbeddings() const;
@@ -208,11 +310,11 @@ public:
 class MIR2VecVocabLegacyAnalysis : public ImmutablePass {
   using VocabVector = std::vector<mir2vec::Embedding>;
   using VocabMap = std::map<std::string, mir2vec::Embedding>;
-  VocabMap StrVocabMap;
-  VocabVector Vocab;
+  std::optional<mir2vec::MIRVocabulary> Vocab;
 
   StringRef getPassName() const override;
-  Error readVocabulary();
+  Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab,
+                       VocabMap &PhyRegVocabMap, VocabMap &VirtRegVocabMap);
 
 protected:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -275,4 +377,4 @@ MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_MIR2VEC_H
-\ No newline at end of file
+#endif // LLVM_CODEGEN_MIR2VEC_H
diff --git a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
index 770f1b3..5504896 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
@@ -37,7 +37,7 @@ public:
   MachineModuleSlotTracker(const MachineModuleInfo &MMI,
                            const MachineFunction *MF,
                            bool ShouldInitializeAllMetadata = true);
-  ~MachineModuleSlotTracker();
+  ~MachineModuleSlotTracker() override;
 
   void collectMachineMDNodes(MachineMDNodeListType &L) const;
 };
diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index fbb958cc..66cab3d 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -306,7 +306,7 @@ struct GlobalOutlinedFunction : public OutlinedFunction {
   }
 
   GlobalOutlinedFunction() = delete;
-  ~GlobalOutlinedFunction() = default;
+  ~GlobalOutlinedFunction() override = default;
 };
 
 } // namespace outliner
diff --git a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
index c15bc67..0af4f47 100644
--- a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
+++ b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
@@ -75,7 +75,7 @@ namespace llvm {
 
   public:
     ResourcePriorityQueue(SelectionDAGISel *IS);
-    ~ResourcePriorityQueue();
+    ~ResourcePriorityQueue() override;
 
     bool isBottomUp() const override { return false; }
 
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 201dc68..0dcf400 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -559,6 +559,11 @@ m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F) {
 }
 
 template <typename T0_P, typename T1_P, typename T2_P>
+inline auto m_SelectLike(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+  return m_AnyOf(m_Select(Cond, T, F), m_VSelect(Cond, T, F));
+}
+
+template <typename T0_P, typename T1_P, typename T2_P>
 inline Result_match<0, TernaryOpc_match<T0_P, T1_P, T2_P>>
 m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) {
   return m_Result<0>(
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 4eacbdc..26d7080 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseMultiSet.h"
-#include "llvm/ADT/identity.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 822245f..f031353 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -280,7 +280,7 @@ protected:
                      unsigned Mode = 0);
 
 public:
-  virtual ~TargetRegisterInfo();
+  ~TargetRegisterInfo() override;
 
   /// Return the number of registers for the function. (may overestimate)
   virtual unsigned getNumSupportedRegs(const MachineFunction &) const {
diff --git a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
index 112ff6d..65ff1eb 100644
--- a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
@@ -223,7 +223,7 @@ public:
   enum { TopQID = 1, BotQID = 2, LogMaxQID = 2 };
 
   ConvergingVLIWScheduler() : Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
-  virtual ~ConvergingVLIWScheduler() = default;
+  ~ConvergingVLIWScheduler() override = default;
 
   void initialize(ScheduleDAGMI *dag) override;
 
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 8e7d9dc..8ce2b1b 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -410,7 +410,6 @@ def LessPreciseFPMAD : StrBoolAttr<"less-precise-fpmad">;
 def NoInfsFPMath : StrBoolAttr<"no-infs-fp-math">;
 def NoNansFPMath : StrBoolAttr<"no-nans-fp-math">;
 def NoSignedZerosFPMath : StrBoolAttr<"no-signed-zeros-fp-math">;
-def UnsafeFPMath : StrBoolAttr<"unsafe-fp-math">;
 def NoJumpTables : StrBoolAttr<"no-jump-tables">;
 def NoInlineLineTables : StrBoolAttr<"no-inline-line-tables">;
 def ProfileSampleAccurate : StrBoolAttr<"profile-sample-accurate">;
@@ -474,7 +473,6 @@ def : MergeRule<"setAND<LessPreciseFPMADAttr>">;
 def : MergeRule<"setAND<NoInfsFPMathAttr>">;
 def : MergeRule<"setAND<NoNansFPMathAttr>">;
 def : MergeRule<"setAND<NoSignedZerosFPMathAttr>">;
-def : MergeRule<"setAND<UnsafeFPMathAttr>">;
 def : MergeRule<"setOR<NoImplicitFloatAttr>">;
 def : MergeRule<"setOR<NoJumpTablesAttr>">;
 def : MergeRule<"setOR<ProfileSampleAccurateAttr>">;
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 31096e8..540d60a 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -96,6 +96,16 @@ namespace llvm {
   /// info. Return true if module is modified.
   LLVM_ABI bool UpgradeDebugInfo(Module &M);
 
+  /// Copies module attributes to the functions in the module.
+  /// Currently only affects ARM, Thumb and AArch64 targets.
+  /// Supported attributes:
+  ///  - branch-target-enforcement
+  ///  - branch-protection-pauth-lr
+  ///  - guarded-control-stack
+  ///  - sign-return-address
+  ///  - sign-return-address-with-bkey
+  LLVM_ABI void copyModuleAttrToFunctions(Module &M);
+
   /// Check whether a string looks like an old loop attachment tag.
   inline bool mayBeOldLoopAttachmentTag(StringRef Name) {
     return Name.starts_with("llvm.vectorizer.");
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 0062cec..98df06a 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1449,6 +1449,9 @@ private:
   /// every summary of a GV is synchronized.
   bool WithDSOLocalPropagation = false;
 
+  /// Indicates that summary-based internalization and promotion has run.
+  bool WithInternalizeAndPromote = false;
+
   /// Indicates that we have whole program visibility.
   bool WithWholeProgramVisibility = false;
 
@@ -1653,6 +1656,9 @@ public:
   bool withDSOLocalPropagation() const { return WithDSOLocalPropagation; }
   void setWithDSOLocalPropagation() { WithDSOLocalPropagation = true; }
 
+  bool withInternalizeAndPromote() const { return WithInternalizeAndPromote; }
+  void setWithInternalizeAndPromote() { WithInternalizeAndPromote = true; }
+
   bool withWholeProgramVisibility() const { return WithWholeProgramVisibility; }
   void setWithWholeProgramVisibility() { WithWholeProgramVisibility = true; }
 
diff --git a/llvm/include/llvm/IRReader/IRReader.h b/llvm/include/llvm/IRReader/IRReader.h
index 790140f..00cf12d 100644
--- a/llvm/include/llvm/IRReader/IRReader.h
+++ b/llvm/include/llvm/IRReader/IRReader.h
@@ -15,6 +15,7 @@
 #define LLVM_IRREADER_IRREADER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Support/Compiler.h"
 #include <memory>
@@ -50,19 +51,19 @@ getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
 /// for it.  Otherwise, attempt to parse it as LLVM Assembly and return
 /// a Module for it.
 /// \param DataLayoutCallback Override datalayout in the llvm assembly.
-LLVM_ABI std::unique_ptr<Module> parseIR(MemoryBufferRef Buffer,
-                                         SMDiagnostic &Err,
-                                         LLVMContext &Context,
-                                         ParserCallbacks Callbacks = {});
+LLVM_ABI std::unique_ptr<Module>
+parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, LLVMContext &Context,
+        ParserCallbacks Callbacks = {},
+        AsmParserContext *ParserContext = nullptr);
 
 /// If the given file holds a bitcode image, return a Module for it.
 /// Otherwise, attempt to parse it as LLVM Assembly and return a Module
 /// for it.
 /// \param DataLayoutCallback Override datalayout in the llvm assembly.
-LLVM_ABI std::unique_ptr<Module> parseIRFile(StringRef Filename,
-                                             SMDiagnostic &Err,
-                                             LLVMContext &Context,
-                                             ParserCallbacks Callbacks = {});
+LLVM_ABI std::unique_ptr<Module>
+parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+            ParserCallbacks Callbacks = {},
+            AsmParserContext *ParserContext = nullptr);
 }
 
 #endif
diff --git a/llvm/include/llvm/Support/AllocToken.h b/llvm/include/llvm/Support/AllocToken.h
new file mode 100644
index 0000000..e40d816
--- /dev/null
+++ b/llvm/include/llvm/Support/AllocToken.h
@@ -0,0 +1,68 @@
+//===- llvm/Support/AllocToken.h - Allocation Token Calculation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of AllocToken modes and shared calculation of stateless token IDs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ALLOCTOKEN_H
+#define LLVM_SUPPORT_ALLOCTOKEN_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <optional>
+
+namespace llvm {
+
+/// Modes for generating allocation token IDs.
+enum class AllocTokenMode {
+  /// Incrementally increasing token ID.
+  Increment,
+
+  /// Simple mode that returns a statically-assigned random token ID.
+  Random,
+
+  /// Token ID based on allocated type hash.
+  TypeHash,
+
+  /// Token ID based on allocated type hash, where the top half ID-space is
+  /// reserved for types that contain pointers and the bottom half for types
+  /// that do not contain pointers.
+  TypeHashPointerSplit,
+};
+
+/// The default allocation token mode.
+inline constexpr AllocTokenMode DefaultAllocTokenMode =
+    AllocTokenMode::TypeHashPointerSplit;
+
+/// Returns the AllocTokenMode from its canonical string name; if an invalid
+/// name was provided returns nullopt.
+LLVM_ABI std::optional<AllocTokenMode>
+getAllocTokenModeFromString(StringRef Name);
+
+/// Metadata about an allocation used to generate a token ID.
+struct AllocTokenMetadata {
+  SmallString<64> TypeName;
+  bool ContainsPointer = false; // Never indeterminate when default-constructed.
+};
+
+/// Calculates stable allocation token ID. Returns std::nullopt for stateful
+/// modes that are only available in the AllocToken pass.
+///
+/// \param Mode The token generation mode.
+/// \param Metadata The metadata about the allocation.
+/// \param MaxTokens The maximum number of tokens (must not be 0)
+/// \return The calculated allocation token ID, or std::nullopt.
+LLVM_ABI std::optional<uint64_t>
+getAllocToken(AllocTokenMode Mode, const AllocTokenMetadata &Metadata,
+              uint64_t MaxTokens);
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_ALLOCTOKEN_H
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 2c2122a..bfd2817 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -118,9 +118,8 @@ enum CodeObjectVersionKind {
 class TargetOptions {
 public:
   TargetOptions()
-      : UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false),
-        NoTrappingFPMath(true), NoSignedZerosFPMath(false),
-        EnableAIXExtendedAltivecABI(false),
+      : NoInfsFPMath(false), NoNaNsFPMath(false), NoTrappingFPMath(true),
+        NoSignedZerosFPMath(false), EnableAIXExtendedAltivecABI(false),
         HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false),
         GuaranteedTailCallOpt(false), StackSymbolOrdering(true),
         EnableFastISel(false), EnableGlobalISel(false), UseInitArray(false),
@@ -156,13 +155,6 @@ public:
   /// MCAsmInfo::BinutilsVersion.
   std::pair<int, int> BinutilsVersion{0, 0};
 
-  /// UnsafeFPMath - This flag is enabled when the
-  /// -enable-unsafe-fp-math flag is specified on the command line.  When
-  /// this flag is off (the default), the code generator is not allowed to
-  /// produce results that are "less precise" than IEEE allows.  This includes
-  /// use of X86 instructions like FSIN and FCOS instead of libcalls.
-  unsigned UnsafeFPMath : 1;
-
   /// NoInfsFPMath - This flag is enabled when the
   /// -enable-no-infs-fp-math flag is specified on the command line. When
   /// this flag is off (the default), the code generator is not allowed to
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
index b1391cb0..077703c 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
@@ -16,6 +16,7 @@
 
 #include "llvm/IR/Analysis.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/AllocToken.h"
 #include <optional>
 
 namespace llvm {
@@ -23,6 +24,7 @@ namespace llvm {
 class Module;
 
 struct AllocTokenOptions {
+  AllocTokenMode Mode = DefaultAllocTokenMode;
   std::optional<uint64_t> MaxTokens;
   bool FastABI = false;
   bool Extended = false;
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 979f3b3e..e677cbf 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -21,6 +21,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Printable.h"
 #include <cassert>
 
 namespace llvm {
@@ -611,6 +612,10 @@ LLVM_ABI void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder);
 // br/brcond/unreachable/ret
 LLVM_ABI bool hasOnlySimpleTerminator(const Function &F);
 
+/// Print BasicBlock \p BB as an operand or print "<nullptr>" if \p BB is a
+/// nullptr.
+LLVM_ABI Printable printBasicBlock(const BasicBlock *BB);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index f9bf092..6f19a68 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -255,6 +255,12 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name,
   if (!ContainedType)
     return;
 
+  SmallVector<uint64_t> ArrayDimensions;
+  while (ArrayType *AT = dyn_cast<ArrayType>(ContainedType)) {
+    ArrayDimensions.push_back(AT->getNumElements());
+    ContainedType = AT->getElementType();
+  }
+
   StringRef ElementName;
   ElementType ET = toDXILElementType(ContainedType, IsSigned);
   if (ET != ElementType::Invalid) {
@@ -271,6 +277,8 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name,
   DestStream << "<" << ElementName;
   if (const FixedVectorType *VTy = dyn_cast<FixedVectorType>(ContainedType))
     DestStream << VTy->getNumElements();
+  for (uint64_t Dim : ArrayDimensions)
+    DestStream << "[" << Dim << "]";
   DestStream << ">";
 }
 
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index b573023..8da51d0 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4866,89 +4866,6 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F,
   return nullptr;
 }
 
-/// Look for the following pattern and simplify %to_fold to %identicalPhi.
-/// Here %phi, %to_fold and %phi.next perform the same functionality as
-/// %identicalPhi and hence the select instruction %to_fold can be folded
-/// into %identicalPhi.
-///
-/// BB1:
-///   %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ]
-///   %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ]
-///   ...
-///   %identicalPhi.next = select %cmp, %val, %identicalPhi
-///                      (or select %cmp, %identicalPhi, %val)
-///   %to_fold = select %cmp2, %identicalPhi, %phi
-///   %phi.next = select %cmp, %val, %to_fold
-///             (or select %cmp, %to_fold, %val)
-///
-/// Prove that %phi and %identicalPhi are the same by induction:
-///
-/// Base case: Both %phi and %identicalPhi are equal on entry to the loop.
-/// Inductive case:
-/// Suppose %phi and %identicalPhi are equal at iteration i.
-/// We look at their values at iteration i+1 which are %phi.next and
-/// %identicalPhi.next. They would have become different only when %cmp is
-/// false and the corresponding values %to_fold and %identicalPhi differ
-/// (similar reason for the other "or" case in the bracket).
-///
-/// The only condition when %to_fold and %identicalPh could differ is when %cmp2
-/// is false and %to_fold is %phi, which contradicts our inductive hypothesis
-/// that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are
-/// always equal at iteration i+1.
-bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) {
-  if (PN.getParent() != IdenticalPN.getParent())
-    return false;
-  if (PN.getNumIncomingValues() != 2)
-    return false;
-
-  // Check that only the backedge incoming value is different.
-  unsigned DiffVals = 0;
-  BasicBlock *DiffValBB = nullptr;
-  for (unsigned i = 0; i < 2; i++) {
-    BasicBlock *PredBB = PN.getIncomingBlock(i);
-    if (PN.getIncomingValueForBlock(PredBB) !=
-        IdenticalPN.getIncomingValueForBlock(PredBB)) {
-      DiffVals++;
-      DiffValBB = PredBB;
-    }
-  }
-  if (DiffVals != 1)
-    return false;
-  // Now check that the backedge incoming values are two select
-  // instructions with the same condition. Either their true
-  // values are the same, or their false values are the same.
-  auto *SI = dyn_cast<SelectInst>(PN.getIncomingValueForBlock(DiffValBB));
-  auto *IdenticalSI =
-      dyn_cast<SelectInst>(IdenticalPN.getIncomingValueForBlock(DiffValBB));
-  if (!SI || !IdenticalSI)
-    return false;
-  if (SI->getCondition() != IdenticalSI->getCondition())
-    return false;
-
-  SelectInst *SIOtherVal = nullptr;
-  Value *IdenticalSIOtherVal = nullptr;
-  if (SI->getTrueValue() == IdenticalSI->getTrueValue()) {
-    SIOtherVal = dyn_cast<SelectInst>(SI->getFalseValue());
-    IdenticalSIOtherVal = IdenticalSI->getFalseValue();
-  } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) {
-    SIOtherVal = dyn_cast<SelectInst>(SI->getTrueValue());
-    IdenticalSIOtherVal = IdenticalSI->getTrueValue();
-  } else {
-    return false;
-  }
-
-  // Now check that the other values in select, i.e., %to_fold and
-  // %identicalPhi, are essentially the same value.
-  if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN)
-    return false;
-  if (!(SIOtherVal->getTrueValue() == &IdenticalPN &&
-        SIOtherVal->getFalseValue() == &PN) &&
-      !(SIOtherVal->getTrueValue() == &PN &&
-        SIOtherVal->getFalseValue() == &IdenticalPN))
-    return false;
-  return true;
-}
-
 /// Given operands for a SelectInst, see if we can fold the result.
 /// If not, this returns null.
 static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
@@ -5124,14 +5041,7 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
   std::optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL);
   if (Imp)
     return *Imp ? TrueVal : FalseVal;
-  // Look for same PHIs in the true and false values.
-  if (auto *TruePHI = dyn_cast<PHINode>(TrueVal))
-    if (auto *FalsePHI = dyn_cast<PHINode>(FalseVal)) {
-      if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI))
-        return FalseVal;
-      if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI))
-        return TrueVal;
-    }
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 6f7dd79..7597f3a 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -3768,13 +3768,11 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   return getOrCreateAddRecExpr(Operands, L, Flags);
 }
 
-const SCEV *
-ScalarEvolution::getGEPExpr(GEPOperator *GEP,
-                            const SmallVectorImpl<const SCEV *> &IndexExprs) {
+const SCEV *ScalarEvolution::getGEPExpr(GEPOperator *GEP,
+                                        ArrayRef<const SCEV *> IndexExprs) {
   const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand());
   // getSCEV(Base)->getType() has the same address space as Base->getType()
   // because SCEV::getType() preserves the address space.
-  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   GEPNoWrapFlags NW = GEP->getNoWrapFlags();
   if (NW != GEPNoWrapFlags::none()) {
     // We'd like to propagate flags from the IR to the corresponding SCEV nodes,
@@ -3787,13 +3785,20 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
       NW = GEPNoWrapFlags::none();
   }
 
+  return getGEPExpr(BaseExpr, IndexExprs, GEP->getSourceElementType(), NW);
+}
+
+const SCEV *ScalarEvolution::getGEPExpr(const SCEV *BaseExpr,
+                                        ArrayRef<const SCEV *> IndexExprs,
+                                        Type *SrcElementTy, GEPNoWrapFlags NW) {
   SCEV::NoWrapFlags OffsetWrap = SCEV::FlagAnyWrap;
   if (NW.hasNoUnsignedSignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNSW);
   if (NW.hasNoUnsignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNUW);
 
-  Type *CurTy = GEP->getType();
+  Type *CurTy = BaseExpr->getType();
+  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   bool FirstIter = true;
   SmallVector<const SCEV *, 4> Offsets;
   for (const SCEV *IndexExpr : IndexExprs) {
@@ -3812,7 +3817,7 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
       if (FirstIter) {
         assert(isa<PointerType>(CurTy) &&
                "The first index of a GEP indexes a pointer");
-        CurTy = GEP->getSourceElementType();
+        CurTy = SrcElementTy;
         FirstIter = false;
       } else {
         CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
diff --git a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
index 0afe5c7..f026b0d 100644
--- a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
+++ b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
@@ -1,5 +1,5 @@
 {
-    "entities" : {
+    "Opcodes" : {
         "ABS_Fp":[0.07323841750621796, -0.006006906274706125, 0.09751169383525848, -0.011089739389717579, 0.06642112135887146, -0.015824640169739723, -0.021592319011688232, -0.0035401300992816687, 0.06047678738832474, -0.007392085622996092, 0.07134906202554703, -0.019624482840299606, -0.10975595563650131, -0.007685789838433266, 0.07451746612787247, 0.06384266912937164, -0.08230067789554596, 0.050922468304634094, 0.013724055141210556, 0.015687907114624977, -0.018451329320669174, 0.046987198293209076, -0.037734340876340866, -0.07235030829906464, 0.10218106210231781, 0.08037368208169937, -0.029537858441472054, -0.047520823776721954, -0.022125739604234695, -0.03125226870179176, -0.02882847562432289, 0.013811410404741764, 0.0023568253964185715, 0.017958490177989006, -0.05359291657805443, -0.03606243059039116, 0.07840022444725037, -0.016711654141545296, -0.038644544780254364, 0.05886651948094368, -0.011418955400586128, -0.04882095381617546, 0.04027162492275238, 0.001088760793209076, 0.03045983798801899, -0.10998888313770294, -0.0097441291436553, 0.015445191413164139, 0.030951637774705887, -0.06309321522712708, -0.019475746899843216, -0.029662512242794037, 0.05312168970704079, 0.05355998873710632, 0.05060160160064697, -0.053278811275959015, -0.01803833432495594, 0.010853713378310204, -0.053911495953798294, 0.06630647927522659, -0.08671313524246216, 0.0699775293469429, -0.08346731215715408, -0.045348167419433594, 0.06779918074607849, 0.008865933865308762, 0.05460203066468239, 0.007126103155314922, 0.0012282058596611023, 0.06817980855703354, 0.0216530654579401, 0.03552381321787834, 0.015414077788591385, -0.06002715229988098, 0.05233345925807953, 0.0782286673784256, 0.04220856353640556, -0.005762201733887196, 0.004772072657942772, 0.004578332882374525, 0.002619141712784767, 0.024511393159627914, -0.10089710354804993, 0.018322769552469254, 0.020811809226870537, -0.03358744457364082, -0.06896928697824478, -0.007399350870400667, -0.044467780739068985, -0.08094192296266556, 
-0.09795571863651276, 0.08391229063272476, -0.04749457910656929, 0.0029586481396108866, -5.354872337193228e-05, 0.005788655485957861, 0.015252145007252693, 0.06928747892379761, 0.041780371218919754, 0.016391364857554436],
         "ADC":[-0.07533542811870575, -0.01729339174926281, 0.04298720881342888, 0.015697332099080086, -0.04403507336974144, -0.059322185814380646, -0.050977922976017, 0.027526788413524628, -0.07009710371494293, -0.025621667504310608, 0.0352291613817215, -0.011538374237716198, 0.03682859241962433, -0.09788215160369873, -0.07216927409172058, -0.03659192472696304, 0.05676230415701866, -0.06369645893573761, -0.04756825789809227, 0.005865555722266436, 0.022270306944847107, -0.042112063616514206, 0.07008901983499527, 0.07748222351074219, -0.1020870953798294, -0.008511601015925407, -0.05725255608558655, -0.07881367206573486, 0.05627593398094177, -0.0005361076910048723, 0.03351512551307678, 0.04348289221525192, -0.08322969079017639, -0.02161242999136448, -0.07805898040533066, 0.04819482937455177, -0.061123576015233994, -0.010114834643900394, -0.04676959663629532, -0.008176938630640507, 0.010575453750789165, -0.04312445595860481, 0.00376943894661963, -0.0691257119178772, 0.03553615137934685, 0.10397598147392273, 0.009375158697366714, 0.001147320494055748, 0.026351911947131157, -0.0194610096514225, -0.05202522128820419, 0.014047946780920029, -0.040036872029304504, 0.06963572651147842, 0.04827437922358513, -0.06908547878265381, 0.024857567623257637, -0.03304143249988556, 0.02291242778301239, 0.07687342166900635, -0.05110599845647812, -0.00873416755348444, 0.026205750182271004, 0.045064594596624374, -0.03565925359725952, 0.09580051153898239, -0.02518773265182972, 0.047807395458221436, -0.03548192232847214, 0.08286304026842117, -0.053511787205934525, 0.02892065793275833, -0.0495525486767292, 0.02590095065534115, -0.006982128601521254, 0.006042638327926397, -0.07269058376550674, 0.02401554025709629, -0.05660006031394005, -0.026029467582702637, 0.05318204686045647, 0.06714116781949997, -0.0023821850772947073, 0.05028798058629036, -0.005811943672597408, -0.003296421840786934, -0.005409242119640112, -0.10150349885225296, -0.06406981498003006, 0.02553202211856842, 
-0.002790689468383789, 0.0663856491446495, 0.09109167754650116, -0.04678672179579735, 0.022019781172275543, 0.007821275852620602, 0.022490357980132103, -0.058503177016973495, 0.08841150254011154, -0.00892670825123787],
         "ADD":[-0.037626221776008606, 0.006784931290894747, 0.10051396489143372, -0.0014993306249380112, -0.0323498398065567, -0.03148593008518219, -0.014100957661867142, -0.020252650603652, 0.014126972295343876, -0.1295478343963623, 0.08520576357841492, -0.02513248659670353, 0.03539956361055374, -0.07019674777984619, -0.019069846719503403, 0.016678515821695328, -0.009174983017146587, -0.019034702330827713, -0.024083402007818222, -0.07829779386520386, -0.007908892817795277, -0.07924024760723114, -0.034599609673023224, 0.05271153524518013, 0.0016642026603221893, -0.03938138112425804, 0.0019624519627541304, 0.03562740981578827, 0.07340876758098602, 0.09457183629274368, -0.06507840752601624, 0.00246993126347661, -0.004548616707324982, 0.058226197957992554, -0.021043049171566963, -0.0599520243704319, -0.03138553351163864, 0.03265950828790665, 0.004963710438460112, -0.003248866181820631, -0.04021746292710304, 0.038208190351724625, -0.02256007120013237, 0.10770396143198013, 0.013757425360381603, 0.040707558393478394, -0.00694271270185709, -0.012331271544098854, 0.004992029629647732, -0.032236646860837936, 0.01055158581584692, 0.04604483023285866, 0.09973260760307312, 0.07322807610034943, 0.06853726506233215, 0.004230210557579994, -0.04007832333445549, 0.16341225802898407, -0.01683313027024269, -0.01998194307088852, -0.035919081419706345, -0.055582448840141296, 0.008072910830378532, -0.0054771858267486095, -0.013343624770641327, 0.014230597764253616, -0.06542462855577469, 0.015897123143076897, -0.06011642515659332, 0.07983837276697159, 0.026512078940868378, 0.014883842319250107, -0.015171286650002003, 4.1508101276122034e-05, -0.048078570514917374, -0.052594274282455444, -0.07897629588842392, -0.01334046758711338, -0.06180298700928688, 0.022423526272177696, 0.07393807917833328, 0.022332284599542618, 0.04279463365674019, 0.04075624793767929, 0.007524204906076193, -0.024405587464571, 0.0011822516098618507, -0.0019135301699861884, 0.10789427906274796, -0.040499038994312286, 
0.011574117466807365, 0.048836030066013336, 0.0380941741168499, -0.047072283923625946, -0.01285380870103836, -0.038019485771656036, -0.06277137994766235, -0.0034404860343784094, -0.031123748049139977, 0.04279843345284462],
@@ -47,7 +47,6 @@
         "CVTSS":[-0.06638028472661972, -0.011326023377478123, 0.008208844810724258, 0.007368308026343584, 0.009791173972189426, -0.03396046161651611, 0.02250068075954914, -0.057750262320041656, -0.04949551820755005, 0.02559898979961872, -0.025012727826833725, -0.05923935025930405, 0.005058884620666504, 0.008716589771211147, -0.017511164769530296, -0.07095059007406235, -0.06573225557804108, -0.028140492737293243, 0.11092227697372437, 0.02664722129702568, -0.01997300609946251, 0.0798712745308876, -0.022800235077738762, 0.09157945215702057, 0.025709187611937523, -0.09037603437900543, -0.07092109322547913, -0.04094154015183449, -0.025702493265271187, 0.015247789211571217, 0.06089004501700401, 0.051023274660110474, -0.04670926183462143, 0.04763137549161911, -0.035940639674663544, 0.002320673782378435, -0.005764417815953493, -0.07975194603204727, -0.0038822791539132595, 0.06728507578372955, 0.020742014050483704, 0.08809743821620941, -0.061493389308452606, -0.0485445000231266, -0.022268671542406082, 0.08475345373153687, -0.0030403153505176306, -0.05737586319446564, -0.07930854707956314, -0.01657176949083805, 0.04658877104520798, 0.005716703832149506, -0.04288295656442642, -0.08686209470033646, -0.07359853386878967, 0.02947128191590309, -0.03684910386800766, -0.03841136023402214, 0.01288131158798933, -0.04918907582759857, -0.05579863488674164, 0.06267702579498291, -0.0034505922812968493, 0.034628838300704956, 0.04280426353216171, 0.042202845215797424, 0.012274117209017277, 0.025021208450198174, -0.07867497205734253, 0.03826712444424629, 0.017088277265429497, 0.037250861525535583, -0.016143174842000008, -0.06754780560731888, -0.013957766816020012, 0.1060054823756218, 0.014829001389443874, 0.06808885931968689, 0.022929415106773376, -0.10870063304901123, -0.002258410444483161, 0.009293666109442711, 0.08529872447252274, -0.018672339618206024, -0.06721168756484985, 0.04180533438920975, -0.0031767592299729586, -0.023869113996624947, -0.00011912015179404989, 
-0.034519728273153305, 0.0022619885858148336, -0.00573525857180357, -0.033912476152181625, 0.059763263911008835, -0.048703599721193314, -0.07433722168207169, 0.04105979949235916, 0.0022583131212741137, 0.03093089908361435, -0.05187990516424179],
         "CVTTSD":[-0.08537309616804123, 0.0010597433429211378, 0.07481679320335388, 0.05997887998819351, -0.0376993790268898, 0.10309506952762604, 0.07795511186122894, 0.0833413377404213, 0.056095756590366364, 0.05851535126566887, -0.057075001299381256, 0.020756129175424576, -0.08901876956224442, 0.02559811621904373, -0.016971183940768242, -0.04282280057668686, -0.005386374890804291, -0.06672719866037369, -0.09664622694253922, 0.06042492762207985, -0.042353514581918716, 0.06194235011935234, -0.025712836533784866, -0.029526079073548317, 0.044016264379024506, 0.036507125943899155, -0.038406822830438614, 0.006118632387369871, -0.0495009683072567, -0.07487531006336212, -0.07304015755653381, 0.042621925473213196, -0.06314127147197723, 0.03934277594089508, -0.09373295307159424, -0.05887934938073158, 0.010626542381942272, -0.050934500992298126, -0.037448156625032425, 0.01178495679050684, -0.07045318186283112, 0.10210251808166504, -0.07279546558856964, 0.04947654530405998, -0.039519909769296646, 0.07030976563692093, -0.011039734818041325, 0.01187387015670538, -0.0840335488319397, -0.005615191534161568, -0.06869980692863464, -0.012282256036996841, -0.013054385781288147, -0.0711965560913086, 0.015505223535001278, 0.0693473145365715, 0.012862266041338444, -0.04747828096151352, 0.023439936339855194, 0.03891129791736603, -0.04998844489455223, -0.04673001170158386, 0.02121424488723278, 0.0501207634806633, 0.07420068979263306, -0.014888633042573929, 0.007586659397929907, 0.01340668834745884, -0.09216003119945526, 0.09335170686244965, 0.023272672668099403, 0.030810026451945305, 0.05792044475674629, -0.020374637097120285, -0.02717672660946846, 0.028085753321647644, 0.08691198378801346, 0.061656054109334946, -0.07689087092876434, 0.0407567173242569, 0.010403914377093315, -0.03389676660299301, 0.07075867801904678, 0.002534526167437434, -0.026066122576594353, 0.005012217443436384, 0.08335569500923157, -0.02732011303305626, 0.03854125738143921, 0.03336648270487785, 
-0.10646265000104904, -0.003997548017650843, 0.09871185570955276, 0.0275016650557518, 0.015653448179364204, 0.07066125422716141, 0.05811227858066559, 0.046357106417417526, 0.047027964144945145, 0.07407277077436447],
         "CVTTSS":[-0.07762601226568222, 0.051891617476940155, 0.02840222790837288, 0.012996217235922813, -0.04709569737315178, -0.011790127493441105, 0.07787185907363892, 0.07411551475524902, 0.04010153189301491, 0.000911108567379415, -0.09610971063375473, 0.042953960597515106, 0.01613607630133629, -0.07504888623952866, -0.04967263713479042, 0.06148393824696541, -0.018901845440268517, 0.08033818751573563, -0.06893469393253326, -0.036083199083805084, 0.08206851035356522, 0.08462843298912048, 0.06728347390890121, -0.03210798278450966, -0.019102206453680992, 0.0723310112953186, 0.009836986660957336, -0.057902153581380844, 0.007954364642500877, -0.015247606672346592, 0.08317636698484421, -0.030078981071710587, -0.003329804167151451, -0.00047014118172228336, -0.02859017252922058, -0.07635723054409027, -0.008230162784457207, 0.03107159025967121, -0.009525406174361706, 0.06515175849199295, -0.06525594741106033, -0.028639627620577812, -0.0781184732913971, 0.009911812841892242, 0.011008340865373611, -0.04294031485915184, -0.04256690293550491, -1.394751961925067e-05, -0.029347950592637062, -0.031849224120378494, 0.012988862581551075, -0.0009693846222944558, -0.019299298524856567, 0.0045416890643537045, -0.04690401256084442, -0.04800841212272644, 0.0020325451623648405, -0.02004505693912506, 0.04130777344107628, -0.033602941781282425, -0.06956057250499725, -0.008079515770077705, 0.0033002288546413183, 0.03853915259242058, 0.08760882169008255, -0.04805464297533035, 0.02319355681538582, 0.018974801525473595, -0.08521144837141037, -0.05224936082959175, -0.023577861487865448, 0.01627342589199543, 0.024244949221611023, 0.09439363330602646, -0.007235093507915735, 0.055853620171546936, -0.00885567907243967, 0.02217228338122368, 0.05414341762661934, -0.0278383269906044, -0.000764147553127259, 0.045272815972566605, -0.009049531072378159, 0.05590446665883064, -0.05074811726808548, -0.06311893463134766, -0.026139337569475174, 0.01067473366856575, -0.043730076402425766, 
-0.07134802639484406, -0.11087869852781296, 0.05522335693240166, -0.07894640415906906, -0.06710508465766907, -0.022497203201055527, 0.0777427926659584, -0.07944057136774063, 0.05494234338402748, -0.04788406938314438, -0.032921578735113144],
-        "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 
0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232],
         "DEC":[0.0634445771574974, -0.06605149805545807, 0.03212125599384308, 0.030006375163793564, -0.08837386220693588, -0.016591178253293037, -0.03157195448875427, 0.005282422062009573, 0.04301748052239418, -0.035375431180000305, -0.050481121987104416, -0.10733921080827713, -0.03802337497472763, -0.0745977833867073, -0.03943190351128578, -0.014895747415721416, 0.004689200781285763, -0.05872263386845589, -0.02043316885828972, 0.017881838604807854, -0.02151746302843094, 0.049130357801914215, -0.0980888232588768, -0.0012140831677243114, -0.03892286866903305, -0.050167523324489594, -0.06817777454853058, 0.011282221414148808, 0.0848090872168541, -0.04859968274831772, -0.005405630450695753, 0.09327276051044464, -0.031913015991449356, -0.07784294337034225, -0.039762917906045914, -0.0004000961489509791, -0.03763844072818756, -0.024915525689721107, 0.04509890824556351, 0.05546657368540764, -0.055939678102731705, -0.0467451736330986, -0.030023904517292976, -0.010519847273826599, 0.009574057534337044, 0.023444844409823418, 0.007250144146382809, 0.060414351522922516, -0.0011268716771155596, -0.10112253576517105, -0.068567655980587, -0.044332459568977356, 0.0022569731809198856, -0.012019195593893528, 0.0016708170296624303, 0.01029527559876442, -0.024694599211215973, -0.0428738109767437, 0.053816765546798706, 0.09999147802591324, 0.06608963757753372, 0.014324366115033627, 0.022997796535491943, -0.012565241195261478, -0.008212191984057426, -0.012308428063988686, -0.09830988198518753, -0.04177428036928177, 0.03759279474616051, 0.06749766319990158, -0.08330990374088287, -0.06375840306282043, 0.0471678152680397, 0.06524914503097534, 0.09668447077274323, 0.07395336031913757, -0.06081546097993851, 0.0322561152279377, -0.05461571738123894, 0.022349894046783447, 0.0981096625328064, 0.019211066886782646, 0.10566835105419159, 0.004508140496909618, 0.030159158632159233, 0.1076640635728836, -0.004145997576415539, 0.08043811470270157, 0.030684711411595345, 0.07909402251243591, 
-0.015952520072460175, 0.027102122083306313, 0.017120881006121635, 0.0860346332192421, 0.06145261228084564, -0.01827210932970047, 0.027506740763783455, 0.08201386034488678, -0.09402544051408768, -0.07927247136831284],
         "DIV":[0.08121486008167267, -0.06398852169513702, -0.007856910116970539, 0.09644383192062378, 0.0013691268395632505, 0.03523438796401024, -0.04342259466648102, -0.011761687695980072, 0.021194210276007652, -0.0386938601732254, -0.004948849324136972, -0.08348845690488815, 0.005121953319758177, -0.06682730466127396, -0.004115825518965721, 0.015023703686892986, 0.042783256620168686, 0.08872916549444199, -0.03392689675092697, -0.014770613051950932, 0.001988545060157776, -0.05145770683884621, -0.029310323297977448, 0.06324473023414612, -0.08066411316394806, 0.006997138261795044, 0.004352204035967588, -0.060964930802583694, 0.02948148362338543, 0.052747759968042374, -0.05635778605937958, -0.014655586332082748, 0.015838103368878365, -0.04539657384157181, 0.031915292143821716, 0.05234432592988014, -0.012030252255499363, 0.06431112438440323, -0.027869969606399536, -0.006431832443922758, 0.025956276804208755, 0.047651831060647964, -0.01758543774485588, 0.07249220460653305, -0.049627624452114105, -0.007435495033860207, 0.0015833197394385934, 2.190603845519945e-05, 0.03457536920905113, 0.03895196691155434, -0.037442032247781754, 0.003120564157143235, -0.0690622553229332, -0.04405339062213898, 0.016464274376630783, -0.05068953335285187, 0.009520933963358402, 0.05033525079488754, 0.030095860362052917, 0.08773164451122284, -0.03623930364847183, -0.0076989103108644485, 0.0133424773812294, 0.025229837745428085, 0.018198521807789803, 0.011319941841065884, -0.005582685582339764, -0.03598775342106819, -0.0565820187330246, 0.08609189838171005, 0.035601116716861725, -0.007436969317495823, -0.018040914088487625, -0.04825054481625557, -0.014956142753362656, 0.03343576192855835, -0.0739198625087738, 0.038971979171037674, -0.03691745549440384, -0.0371851809322834, 0.08137080073356628, 0.03924981504678726, -0.06499960273504257, 0.047913506627082825, -0.0464070662856102, 0.04404731094837189, -0.03972303494811058, 0.03341617435216904, 0.05367732420563698, -0.04457789286971092, 
-0.07455608248710632, 0.007865827530622482, 0.04562194645404816, -0.03552774339914322, -0.007738951593637466, 0.09388759732246399, -0.015701837837696075, 0.033921483904123306, -0.017276542261242867, 0.04943705350160599],
         "DIVPDrm":[0.04179735854268074, 0.008989601396024227, 0.0027430830523371696, 0.06804384291172028, -0.06657993793487549, 0.033647675067186356, -0.03707171231508255, 0.08443991839885712, -0.054565757513046265, 0.0765392854809761, -0.08189049363136292, 0.02573087066411972, 0.018917549401521683, 0.079402856528759, -0.011117411777377129, 0.06308865547180176, -0.045432765036821365, -0.05054701492190361, -0.009618235751986504, -0.0594516322016716, 0.07967120409011841, 0.08030137419700623, -0.0768255814909935, -0.061036787927150726, 0.004279104992747307, -0.09737113863229752, 0.07295801490545273, -0.027599459514021873, 0.0045133912935853004, -0.048141367733478546, 0.0003157609316986054, -0.014835191890597343, 0.01462356187403202, -0.03225003555417061, 0.06723359227180481, 0.05244021862745285, 0.07099424302577972, -0.09206876158714294, 0.06154841184616089, -0.022400988265872, 0.034042902290821075, 0.002528816694393754, -0.04578591138124466, -0.023195132613182068, -0.07696253061294556, -0.03475971147418022, 0.03545870631933212, -0.021839862689375877, 0.0036371496971696615, 0.07372148334980011, -0.0596211701631546, -0.06768393516540527, -0.032637521624565125, 0.008432515896856785, 0.007569535635411739, -0.0034237385261803865, 0.05811845883727074, 0.013580343686044216, -0.03924565017223358, -0.025963587686419487, 0.03800642117857933, -0.04651957005262375, -0.033428385853767395, -0.053251899778842926, -0.04647624120116234, -0.034290049225091934, 0.003906013211235404, -0.05534028634428978, 0.04434804245829582, -0.08216925710439682, -0.011801591143012047, -0.006801240611821413, -0.07483590394258499, -0.06332433968782425, -0.005107037723064423, -0.008274846710264683, -0.07277056574821472, 0.03865613043308258, -0.0472225658595562, 0.009775533340871334, 0.055412523448467255, -0.014846398495137691, -0.008565607480704784, -0.018367087468504906, 0.038180120289325714, 0.06085506081581116, -0.02658388763666153, 0.006586031056940556, 0.0761575847864151, -0.007659312803298235, 
-0.10445686429738998, 0.01846102997660637, 0.02885548584163189, 0.0437043160200119, -0.012576445005834103, 0.04055696353316307, 0.002144219819456339, -0.08052077144384384, 0.03422001749277115, 0.03888843208551407],
@@ -62,12 +61,9 @@
         "DIV_Fp":[0.0013771128142252564, -0.03939857333898544, 0.06826473772525787, -0.055852942168712616, 0.021110225468873978, -0.07429434359073639, -0.01439732313156128, 0.047745198011398315, 0.03544871136546135, -0.006474921014159918, -0.05228240415453911, 0.00804696511477232, 0.0025021089240908623, 0.049810487776994705, -0.009595588780939579, 0.0507207065820694, -0.040155258029699326, 0.013851179741322994, -0.09630413353443146, -0.012529753148555756, 0.08176414668560028, 0.05994131416082382, 0.0013053410220891237, -0.035347871482372284, -0.06649265438318253, 0.07997933030128479, -0.042037565261125565, -0.06072461977601051, 0.09246786683797836, -0.0072363922372460365, 0.01850724034011364, 0.03905143961310387, -0.07601091265678406, -0.04824458062648773, -0.014410853385925293, -0.06455439329147339, -0.0593516007065773, -0.047922395169734955, -0.07904111593961716, -0.05896637961268425, -0.05629009008407593, -0.08674604445695877, 0.017179397866129875, -0.0020149857737123966, 0.02413070574402809, 0.024688012897968292, 0.027266085147857666, -0.015890855342149734, -0.00813567265868187, 0.024672919884324074, -0.020992467179894447, 0.019298823550343513, 0.022587062790989876, 0.06570186465978622, 0.061541132628917694, -0.07291612029075623, -0.010421186685562134, 0.032753147184848785, -0.06230449676513672, 0.040921296924352646, 0.05855383351445198, -0.035908423364162445, 0.05353318154811859, -0.013773049227893353, 0.0073576089926064014, 0.016397720202803612, 0.03753839433193207, 0.04765179380774498, -0.041083212941884995, -0.013994180597364902, -0.015261827036738396, 0.0982649177312851, 0.05605688691139221, -0.041869863867759705, -0.017181048169732094, 0.03721241280436516, -0.005489564035087824, 0.026647603139281273, -0.07785916328430176, 0.0476430244743824, -0.006558667402714491, 0.06363014876842499, -0.05705825239419937, -0.048359137028455734, -0.09657922387123108, -0.020021332427859306, 0.05151694640517235, 0.0028305412270128727, -0.012787899002432823, 
-0.09800048917531967, 0.01322718895971775, 0.08181536942720413, -0.04321233555674553, -0.0016350646037608385, -0.03537006303668022, 0.041411954909563065, 0.028577959164977074, 0.01855066418647766, 0.01671769842505455, -0.04467424377799034],
         "DIV_FpI":[-0.010708109475672245, -0.0732470378279686, -0.033443547785282135, -0.06361733376979828, 0.017653197050094604, -0.030770231038331985, -0.0766882598400116, 0.08713997155427933, -0.0696694403886795, 0.0565333366394043, 0.0079630296677351, 0.009157304652035236, 0.07795052230358124, 0.00863052811473608, 0.009487103670835495, -0.021366223692893982, -0.08859013020992279, -0.052845098078250885, 0.07517100870609283, 0.030445149168372154, -0.031006425619125366, -0.011518558487296104, 0.031634584069252014, 0.006774903275072575, -0.0008412582101300359, 0.05720775946974754, -0.03664165362715721, 0.04671872407197952, -0.04702712222933769, 0.08346223086118698, -0.02042539417743683, 0.005731453187763691, -0.02509506233036518, 0.04370206221938133, -0.06398718804121017, 0.052075039595365524, 0.05920809134840965, 0.0037172543816268444, 0.07034561783075333, -0.018100138753652573, 0.002755390014499426, -0.07121799886226654, 0.03879084065556526, 0.013516174629330635, -0.02845778502523899, 0.019500035792589188, 0.014439111575484276, 0.06561631709337234, -0.10264755040407181, -0.016511712223291397, -0.018063146620988846, 0.08819841593503952, -0.0031949833501130342, -0.07884415239095688, -0.10739012062549591, 0.007700629997998476, 0.049550142139196396, 0.0866587832570076, 0.054501283913850784, 0.10046342760324478, 0.01546743419021368, -0.05334487929940224, -0.02652003802359104, -0.009483176283538342, 0.011785115115344524, 0.04965313896536827, -0.030048802495002747, -0.043639082461595535, -0.004809096921235323, -0.0515226274728775, 0.08381897956132889, 0.003956930246204138, 0.03591177612543106, 0.04015829414129257, -0.03484338894486427, 0.04027436673641205, 0.09792015701532364, 0.013287014327943325, 0.09490979462862015, -0.024792836979031563, 0.04872164502739906, 0.026059577241539955, -0.05917894095182419, -0.011415015906095505, -0.024944854900240898, 0.00499876169487834, -0.06721813976764679, -0.03442658111453056, -0.002175490139052272, 0.0004200722905807197, 
-0.10891042649745941, 0.021674668416380882, 0.03700282797217369, -0.0014478170778602362, -0.013477527536451817, -0.02742406167089939, -0.01233668439090252, 0.02371596358716488, 0.04435785114765167, -0.03723753243684769],
         "EH_LABEL":[-0.03541884943842888, 0.012697970494627953, 0.07690317928791046, 0.10800164937973022, -0.033531446009874344, 0.010248170234262943, 0.08690237253904343, -0.018254421651363373, 0.006330807693302631, 0.054908059537410736, 0.05281105265021324, 0.01866377331316471, -0.03986826166510582, 0.012461063452064991, -0.0570770688354969, 0.010465170256793499, -0.0007985670818015933, 0.014928294345736504, -0.08143509179353714, -0.04576095566153526, -0.014382844790816307, 0.09261113405227661, -0.06843073666095734, 0.08642790466547012, 0.010645134374499321, 0.02887858636677265, -0.08228367567062378, 0.06679805368185043, 0.0023300996981561184, 0.02060936950147152, 0.06778941303491592, 0.10305429995059967, 0.06289057433605194, -0.020899023860692978, -0.024045836180448532, 0.0433543361723423, -0.0338776558637619, -0.05156976729631424, -0.02495928853750229, -0.060252029448747635, -0.1022094339132309, 0.014480574987828732, 0.0545940026640892, 0.04824232682585716, 0.06658189743757248, 0.11414545774459839, 0.08304191380739212, -0.03313761577010155, -0.056730128824710846, -0.005165864247828722, -0.00412571569904685, -0.0007486011018045247, -0.03322497382760048, 0.0425163209438324, 0.0785580724477768, 0.015084332786500454, 0.049294766038656235, 0.07518152892589569, -0.008224403485655785, -0.08819448202848434, -0.020814890041947365, 0.054976895451545715, -0.06431052833795547, 0.026952102780342102, -0.02861913852393627, -0.05228573456406593, -0.08044329285621643, 0.02844928950071335, 0.06669115275144577, -0.005387885496020317, 0.05081101134419441, -0.0627083107829094, -0.0785573348402977, -0.042252350598573685, -0.0632990375161171, -0.042457811534404755, -0.07097408920526505, 0.0032806433737277985, 0.039354246109724045, 0.054314617067575455, 0.04231691733002663, 0.00793430395424366, -0.06007056310772896, -0.06129273772239685, -0.008646488189697266, -0.024291129782795906, -0.06316813081502914, 0.02861824445426464, 0.029990797862410545, -0.014714590273797512, 
0.005561451427638531, 0.06847269088029861, 0.05630529299378395, -0.015434914268553257, 0.08646618574857712, -0.0025325058959424496, -0.0046173883602023125, -0.04263639822602272, -0.021261105313897133, 0.02382785640656948],
-        "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, -0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 
0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, -0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833],
         "FLDCW":[-0.0138143515214324, 0.021748993545770645, 7.070673746056855e-05, -0.0897645577788353, 0.09824047237634659, -0.07988506555557251, -0.03454058617353439, 0.0019847718067467213, 0.04983500763773918, 0.03934836760163307, -0.01007675752043724, -0.07798215001821518, -0.08095540851354599, 0.002752745756879449, 0.030696945264935493, 0.017224561423063278, 0.00200466881506145, 0.055515315383672714, -0.06178406998515129, -0.07683275640010834, 0.06503588706254959, 0.06047344580292702, 0.017141321673989296, -0.021984437480568886, -0.05550537258386612, -0.10371828079223633, 0.04531969875097275, 0.04299109801650047, 0.008607891388237476, -0.015554985031485558, -0.08462150394916534, 0.01943030022084713, -0.03486369550228119, -0.06457459926605225, -0.0051103211008012295, 0.05992105230689049, 0.0358397401869297, -0.04655934497714043, -0.018018357455730438, -0.057540085166692734, 0.0061888862401247025, -0.013676634058356285, -0.05362136662006378, 0.06076344475150108, 0.014500541612505913, 0.04466172680258751, 0.025775697082281113, 0.034106262028217316, -0.045596618205308914, 0.022729532793164253, 0.0068075573071837425, 0.033541467040777206, 0.04034329950809479, -0.05922241508960724, -0.11147011071443558, 0.10801365971565247, 0.028543133288621902, -0.076783187687397, 0.0018997815204784274, -0.030598029494285583, 0.04199691861867905, -0.09739390015602112, 0.06310229748487473, -0.03830089420080185, -0.03836864233016968, 0.02324736677110195, 0.10289694368839264, -0.08237223327159882, 0.09511970728635788, -0.022883199155330658, 0.07018155604600906, 0.021149639040231705, 0.06003378704190254, 0.020026177167892456, -0.019267164170742035, 0.06961971521377563, -0.004955677315592766, -0.07218261808156967, 0.08104820549488068, -0.0418921560049057, -0.0317075252532959, 0.020996741950511932, -0.009143776260316372, 0.05348548665642738, -0.0625229999423027, -0.06267517060041428, -0.09454416483640671, -0.043331023305654526, -0.06992270052433014, -0.027888890355825424, 
-0.08271876722574234, -0.05188243091106415, -0.010446823202073574, 0.05846165865659714, -0.010190286673605442, -0.03009830228984356, 0.03426814824342728, -0.03598400205373764, -0.1076725572347641, -0.028831692412495613],
         "FNSTCW":[-0.08537304401397705, 0.014420966617763042, 0.026950713247060776, -0.008387862704694271, -0.0038766334764659405, 0.026867343112826347, -0.030130255967378616, -0.04617878049612045, -0.007106459699571133, -0.0215947013348341, 0.007403566502034664, 0.032729458063840866, 0.0008728280663490295, -0.017559584230184555, 0.017324298620224, -0.014857987873256207, -0.03798896074295044, -0.05294371768832207, 0.05491216480731964, -0.04219334200024605, -0.024796022102236748, 0.033826109021902084, 0.04021430388092995, 0.015585671178996563, -0.025553781539201736, -0.011536196805536747, 0.021523986011743546, 0.01087264809757471, -0.023965656757354736, 0.021311553195118904, -0.0554395355284214, -9.890173532767221e-05, -0.0012819130206480622, -0.055725399404764175, 0.008443817496299744, 0.014645406976342201, 0.09493250399827957, 0.005851465743035078, -0.0346904918551445, -0.018780557438731194, -0.0024646760430186987, -0.04922417551279068, -0.025316428393125534, -0.047623440623283386, 0.04252983629703522, 0.008884137496352196, 0.024444259703159332, 0.11018849164247513, 0.06603030860424042, 0.10775407403707504, -0.06696148216724396, 0.07046543061733246, 0.03569186478853226, 0.06831049919128418, 0.10069368779659271, -0.07917457073926926, 0.07819988578557968, 0.0325605608522892, 0.028253860771656036, -0.03586380183696747, 0.08094784617424011, -0.08532348275184631, 0.08135068416595459, 0.08752897381782532, 0.07736475020647049, 0.03881741315126419, 0.01930568367242813, 0.01373430248349905, 0.07003094255924225, 0.021482432261109352, 0.0606292188167572, 0.005889599211513996, -0.06958997994661331, 0.04857232794165611, 0.09418252855539322, 0.030624384060502052, -0.05853968486189842, 0.0978643149137497, 0.042890243232250214, -0.06594833731651306, -0.00445757107809186, 0.028062766417860985, 0.04270890727639198, 0.049651019275188446, -0.10246159136295319, -0.04101993143558502, -0.06874924898147583, -0.047776881605386734, 0.060615722090005875, 0.022016024217009544, 
-0.0476866140961647, -0.09320542216300964, -0.06186588481068611, 0.030679777264595032, -0.01664678566157818, -0.02508559450507164, -0.0495455376803875, 0.02986457757651806, 0.0242463406175375, -0.03076062723994255],
-        "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, 
-0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805],
         "FsFLD":[-0.0508677139878273, -0.05399654433131218, -0.07149481028318405, -0.047971777617931366, 0.0019320917781442404, -0.007547610439360142, 0.0815814733505249, -0.12202084064483643, -0.08665104955434799, 0.03356856107711792, -0.15713559091091156, -0.0400867722928524, -0.006232412997633219, 0.044278621673583984, 0.09549921005964279, -0.029399411752820015, 0.01864752173423767, -0.04044967144727707, 0.05652021989226341, -0.09881851822137833, 0.025765251368284225, -0.02329906076192856, -0.06028103083372116, 0.09247462451457977, -0.04210466891527176, 0.03263019770383835, -0.03578515350818634, 0.0314578041434288, 0.003650028258562088, 0.04645871743559837, -0.010650137439370155, 0.015904754400253296, 0.018990037962794304, -0.005266033578664064, 0.038479309529066086, 0.008642041124403477, -0.049301791936159134, 0.09484748542308807, 0.005372038576751947, -0.08711376041173935, 0.07584445923566818, 0.09458201378583908, -0.00032702009775675833, 0.048093944787979126, -0.08043119311332703, 0.049779392778873444, -0.006967591121792793, -0.07319328933954239, 0.01582382619380951, -0.006244257558137178, -0.011940727941691875, -0.0013992231106385589, -0.028953444212675095, 0.010995968244969845, -0.005534093361347914, -0.04907146096229553, -0.0039899349212646484, 0.05501222237944603, 0.041574396193027496, 0.030038336291909218, -0.0402531623840332, 0.07675039023160934, 0.01103806123137474, -0.006072944961488247, -0.025336718186736107, 0.06967771798372269, -0.025075508281588554, 0.0031819106079638004, -0.015812508761882782, -0.12114851176738739, 0.07704214751720428, 0.1273191273212433, -0.014406625181436539, -0.031106390058994293, -0.0602225735783577, 0.016253838315606117, -0.059025105088949203, -0.04163780063390732, 0.01571997068822384, 0.025686416774988174, 0.032261066138744354, -0.016690189018845558, 0.014042876660823822, 0.009416786953806877, -0.012661219574511051, 0.013285082764923573, 0.03095356747508049, 0.008239349350333214, 0.0444798618555069, -0.05153216794133186, 
-0.010029821656644344, -0.015202880837023258, 0.06329496204853058, -0.0590473897755146, 0.08585292845964432, -0.08594027906656265, 0.06057215481996536, 0.01079416275024414, -0.04006461799144745, 0.029236430302262306],
-        "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, -0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 
0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893],
         "IDIV":[-0.03631015121936798, -0.07882149517536163, -0.010781447403132915, -0.025117948651313782, 0.01618420146405697, 0.044446997344493866, 0.011386583559215069, -0.00582836102694273, -0.012903614901006222, 0.006322081200778484, -0.07392880320549011, -0.1300479620695114, -0.05186808854341507, -0.06542935222387314, 0.08297666162252426, 0.03790606930851936, -0.07716395705938339, 0.02288512885570526, -0.038660015910863876, -0.04705967381596565, -0.00015759489906486124, -0.06133948266506195, -0.022438891232013702, -0.012017307803034782, 0.01929904706776142, 0.007114879786968231, 0.00567955756559968, -0.041199274361133575, 0.08304950594902039, 0.044402915984392166, -0.10634922981262207, -0.009510381147265434, 0.009772839024662971, -0.048219580203294754, -0.0321214459836483, 0.008684953674674034, 0.009846106171607971, 0.011280585080385208, 0.0310650784522295, 0.05677618831396103, 0.025418052449822426, -0.022629115730524063, 0.0074129728600382805, 0.1081111952662468, -0.03284893184900284, 0.002745774807408452, 0.05030296742916107, 0.04322626441717148, 0.005321172997355461, 0.03260405734181404, -0.051505692303180695, -0.033541131764650345, -0.03955534100532532, 0.047906432300806046, 0.02181984856724739, -0.0026405092794448137, 0.03350621834397316, -0.10710552334785461, -0.01533215120434761, -0.06872875243425369, -0.015413723886013031, -0.007149300072342157, -0.03660491481423378, -0.003503897227346897, -0.02898445539176464, 0.040071532130241394, 0.019684670493006706, -0.10101661086082458, -0.08199643343687057, 0.05637385696172714, -0.03792939707636833, 0.03106122836470604, -0.0590706542134285, -0.03607700765132904, -0.09597010910511017, -0.005815848242491484, 0.017992950975894928, 0.0007907312246970832, 0.04653536528348923, -0.03997295722365379, 0.006737773306667805, 0.11695551127195358, 0.022216010838747025, 0.041878726333379745, -0.035456813871860504, 0.04327021911740303, -0.03799387812614441, 0.10658515244722366, 0.010188632644712925, 0.09275273978710175, 
0.09797771275043488, -0.12400814890861511, 0.03475511074066162, -0.08061601221561432, 0.022533612325787544, -0.11562027782201767, -0.026964085176587105, 0.08614259958267212, -0.025526022538542747, 0.040927182883024216],
         "ILD_Fp":[0.01509383600205183, -0.044326793402433395, -0.051242612302303314, -0.053859174251556396, -0.013097256422042847, -0.06370041519403458, 0.06120477616786957, 0.050328709185123444, -0.04184471070766449, 0.023432370275259018, -0.06435256451368332, 0.02055867575109005, 0.08239544183015823, 0.012251744978129864, -0.05063817650079727, 0.04293346777558327, -0.05919358506798744, -0.03159564360976219, -0.0037220751401036978, -0.001002405071631074, -0.026786377653479576, -0.07405146211385727, 0.044357798993587494, 0.08067265897989273, -0.05229390412569046, -0.06903751194477081, 0.010448710061609745, 0.006885232869535685, -0.052135784178972244, 0.08535145968198776, 0.041820794343948364, -0.020588336512446404, 0.07256042212247849, -0.017755955457687378, -0.032768987119197845, 0.06633710861206055, -0.03427698463201523, -0.10930930078029633, 0.05371936410665512, -0.06794329732656479, -0.014769122004508972, -0.07577606290578842, 0.07853815704584122, -0.09360899031162262, 0.05865737050771713, -0.034065186977386475, 0.05096115916967392, 0.0888199508190155, -0.03904300555586815, 0.03125728294253349, -0.0634637326002121, 0.03385297581553459, 0.027269205078482628, -0.07597903162240982, 0.008366324007511139, -0.03017764538526535, 0.011727942153811455, -0.04941355064511299, 0.027957690879702568, 0.09743025153875351, 0.004836047999560833, -0.028614182025194168, 0.016423141583800316, 0.0895770713686943, 0.025168858468532562, 0.030979957431554794, 0.016665387898683548, 0.025412173941731453, -0.035893514752388, -0.05403519794344902, 0.02931641787290573, 0.07742571830749512, -0.07045850157737732, -0.03433118015527725, -0.03651195392012596, -0.04036823660135269, -0.08663841336965561, 0.05561026185750961, 0.06927209347486496, -0.010819001123309135, -0.10697789490222931, 0.009881369769573212, 0.055065181106328964, -0.06379911303520203, 0.04137800633907318, 0.030417418107390404, -0.03515362739562988, -0.09139228612184525, 0.029920026659965515, 0.027388064190745354, 
-0.06739232689142227, 0.07639766484498978, -0.044223885983228683, 0.02472294308245182, -0.052025098353624344, 0.014643780887126923, 7.120784721337259e-05, 0.018760213628411293, -0.002873474732041359, 0.015561423264443874],
         "IMPLICIT_DEF":[-0.026583483442664146, -0.03995991870760918, 0.03633055090904236, -0.04622741788625717, -0.02326572686433792, 0.02231338992714882, -0.014788332395255566, -0.09906739741563797, 0.022785643115639687, -0.014632754027843475, -0.1041543111205101, 0.05013664439320564, -0.08690599352121353, -0.08063319325447083, 0.030247388407588005, -0.09707676619291306, 0.03499408811330795, 0.012669776566326618, 0.06481463462114334, -0.040453050285577774, -0.0489707849919796, -0.07584276050329208, 0.001047363504767418, 0.08496157824993134, 0.02357148937880993, -0.06866959482431412, 0.09267362207174301, 0.030527250841259956, -0.031355831772089005, 0.02419896423816681, -0.02442512847483158, 0.029297800734639168, 0.10321355611085892, 0.06579483300447464, -0.012722077779471874, 0.10042434185743332, -0.004708406049758196, 0.007217984646558762, 0.0753282904624939, -0.07088368386030197, -0.07383686304092407, 0.06410741060972214, 0.06312107294797897, 0.06989452987909317, 0.03766098991036415, -0.0008440924575552344, -0.023516006767749786, -0.04153933748602867, 0.07342316210269928, 0.05416297912597656, -0.02841850183904171, 0.04128013551235199, -0.001023625023663044, 0.005061942618340254, -0.06027042120695114, 0.025808431208133698, 0.027118714526295662, -0.08965771645307541, 0.012222534976899624, 0.008590211160480976, -0.01785023882985115, 0.03389652445912361, 0.0038459128700196743, 0.021088456735014915, -0.060241442173719406, 0.052924126386642456, -0.03849414363503456, 0.0044007860124111176, 0.05139085650444031, -0.06002991273999214, 0.026294095441699028, 0.06567239761352539, 0.1145782321691513, -0.02774081937968731, -0.07959162443876266, -0.00901349913328886, -0.09212079644203186, -0.016664501279592514, -0.019095804542303085, 0.05008011311292648, -0.016630882397294044, -0.007292845752090216, 0.01243519689887762, 0.011623953469097614, -0.0202464796602726, 0.08120717853307724, 0.04192841053009033, -0.014358888380229473, 0.0402902215719223, -0.05741799250245094, 
0.0023748986423015594, -0.0007613254711031914, -0.11052780598402023, -0.08283583074808121, -0.018524790182709694, -0.09601832926273346, 0.037600427865982056, -0.06403559446334839, -0.08838459849357605, 0.01904650405049324],
@@ -75,12 +71,10 @@
         "INC":[-0.04204729199409485, -0.04558457434177399, -0.004308773670345545, 0.08560862392187119, -0.025844622403383255, -0.01385454647243023, -0.06715847551822662, 0.04059276729822159, 0.0008142509614117444, -0.04987747594714165, 0.05252164602279663, -0.07536070048809052, 0.012251293286681175, -0.01428443193435669, 0.028742481023073196, -0.024608345702290535, 0.009724774397909641, -0.024144234135746956, -0.04345421493053436, -0.03454094007611275, -0.03657921776175499, -0.025569358840584755, 0.04140102490782738, -0.02267373353242874, -0.05346262827515602, -0.07470668852329254, -0.03458420932292938, -0.015982985496520996, 0.013558092527091503, -0.029305797070264816, 0.026653757318854332, -0.00041234202217310667, 0.038508299738168716, 0.08509717136621475, 0.0016276738606393337, -0.013578594662249088, 0.05669381096959114, 0.0274334829300642, 0.023921431973576546, -0.02701006643474102, -0.09357035905122757, 0.07844959199428558, -0.03195708245038986, 0.044196177273988724, 0.017355425283312798, -0.04172753170132637, -0.07773707062005997, 0.018204662948846817, -0.07242465019226074, 0.07735569030046463, 0.03859752044081688, 0.08490721136331558, 0.04661087319254875, 0.015468046069145203, 0.02267235703766346, -0.030244702473282814, -0.043930262327194214, -0.015585970133543015, -0.004605699330568314, 0.0052457586862146854, -0.027553195133805275, -0.06406774371862411, 0.008009923622012138, -0.09624558687210083, 0.07006736844778061, 0.052846722304821014, -0.029392898082733154, -0.0659954622387886, -0.10725440829992294, 0.04428407922387123, 0.02606845460832119, 0.018936248496174812, -0.013534934259951115, 0.03338829427957535, -0.06049540638923645, 0.007389454171061516, 0.030835872516036034, -0.026952944695949554, -0.008518273010849953, 0.07688802480697632, 0.03663042560219765, -0.09961165487766266, -0.02765841968357563, 0.06263019144535065, -0.003026304766535759, -0.0023868512362241745, -0.052803706377744675, 0.04688272252678871, 0.08415349572896957, -0.044724639505147934, 
0.01759890839457512, 0.022962408140301704, 0.00944716576486826, -0.084384024143219, -0.02845100499689579, -0.05094959959387779, -0.08001884073019028, 0.0449872724711895, -0.05161838233470917, 0.015422065742313862],
         "INLINEASM":[0.09296883642673492, -0.007579821161925793, 0.05054628103971481, 0.0011402746895328164, -0.02369365282356739, -0.040429845452308655, 0.048763860017061234, -0.012725423090159893, -0.017820369452238083, -0.0700153335928917, -0.00037883210461586714, 0.06301063299179077, 0.0503254272043705, 0.023893356323242188, -0.07308998703956604, 0.058056626468896866, -0.002504807896912098, -0.03528450429439545, -0.0775352418422699, -0.08423604816198349, 0.01841139607131481, 0.07128658145666122, 0.01363592129200697, 0.05391324311494827, 0.04803359508514404, 0.06145099550485611, -0.03153276443481445, 0.019207997247576714, 0.07138897478580475, 0.06972941011190414, 0.06482893973588943, -0.019937975332140923, -0.00694684125483036, 0.0624234639108181, 0.08495642989873886, 0.017590269446372986, -0.0075670769438147545, 0.05114367976784706, 0.031221428886055946, -0.07108655571937561, -0.018287384882569313, 0.035706836730241776, -0.0794610008597374, -0.03627452626824379, -0.06174106150865555, -0.036826081573963165, -0.030408767983317375, 0.008271732367575169, -0.09423738718032837, 0.004248321522027254, 6.044749170541763e-05, 0.011095447465777397, -0.10245273262262344, -0.07278212904930115, -0.00845671258866787, 0.008961541578173637, 0.019341865554451942, 0.010205359198153019, 0.0724569708108902, -0.08050914853811264, -0.057010360062122345, 0.05053231865167618, -0.04844024032354355, 0.057458631694316864, 0.007486356887966394, -0.029497744515538216, 0.009812748059630394, -0.05314056575298309, 0.11012034863233566, -0.0647352784872055, 0.017479702830314636, -0.027027146890759468, -0.015448061749339104, 0.06321517378091812, -0.06948030740022659, 0.030430838465690613, -0.022251488640904427, -0.0358838327229023, 0.020705783739686012, -0.10970951616764069, -0.07724311202764511, 0.03224516287446022, 0.004828427918255329, 0.07738938182592392, -0.0036471053026616573, 0.06867322325706482, -0.07092054188251495, -0.024759342893958092, -0.054835252463817596, 0.019259851425886154, 
0.011149682104587555, -0.09652992337942123, 0.050764426589012146, -0.0809553936123848, -0.04605351760983467, 0.0399462915956974, 0.05396333709359169, -0.01706104166805744, -0.031266387552022934, 0.020599452778697014],
         "IST_Fp":[-0.046584248542785645, -0.07452045381069183, 0.03998621925711632, 0.03091888502240181, 0.016272397711873055, -0.00985297653824091, -0.007199955638498068, -0.03536335751414299, 0.01673988439142704, 0.07562774419784546, 0.023876583203673363, -0.008683494292199612, 0.04009688273072243, -0.03663905709981918, -0.014492983929812908, 0.07349997758865356, 0.028999919071793556, -0.07499339431524277, -0.03727814927697182, -0.046455491334199905, -0.032447993755340576, 0.02374599315226078, -0.044662121683359146, -0.025333719328045845, 0.037562429904937744, 0.0006656686891801655, -0.00804421491920948, 0.06697870045900345, 0.04367857426404953, -0.0583018884062767, 0.03050180710852146, 0.053111929446458817, -0.04168881103396416, -0.027295507490634918, 0.057777389883995056, 0.08833678811788559, -0.026598922908306122, 0.005393106956034899, -0.05517015606164932, -0.0731138065457344, 0.07386088371276855, -0.07228095829486847, 0.023828018456697464, -0.0025013380218297243, -0.012031037360429764, 0.029700662940740585, -0.101964570581913, 0.0899822935461998, 0.013285316526889801, 0.002607472240924835, 0.04784732311964035, -0.044669900089502335, -0.04348702356219292, -0.07007527351379395, -0.016267215833067894, 0.059609103947877884, -0.036534957587718964, 0.013465121388435364, 0.10186120122671127, 0.015473871491849422, -0.08443709462881088, -0.004981503821909428, 0.06996916979551315, 0.011159068904817104, -0.07315052300691605, 0.024891534820199013, 0.0426689088344574, 0.008847315795719624, -0.06540054082870483, -0.09095568209886551, 0.053956128656864166, -0.010535894893109798, 0.035168495029211044, 0.04921877756714821, -0.07781729847192764, 0.006958760786801577, -0.05714801698923111, -0.06458019465208054, -0.055241748690605164, -0.007552466355264187, -0.02490214817225933, -0.014270482584834099, 0.03710750862956047, 0.003406278323382139, -0.044638775289058685, -0.09159127622842789, -0.025353819131851196, -0.07952282577753067, -0.02874225378036499, -0.06654132902622223, 
0.0031955954618752003, 0.0602104589343071, -0.09261002391576767, -0.06175351142883301, 0.01194009743630886, -0.0348934531211853, 0.04460763558745384, -0.08773446083068848, 0.04335169121623039, 0.054603610187768936],
-        "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, -0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 
0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, -0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983],
         "Int_MemBarrier":[0.0418969988822937, -0.06285926699638367, -0.018717624247074127, -0.0031687396112829447, 0.04023218899965286, 0.08492552489042282, -0.06942103803157806, 0.005588027182966471, -0.08964942395687103, 0.055396437644958496, -0.06732998788356781, 0.06981600075960159, -0.05258888751268387, -0.06051918491721153, 0.02948639541864395, -0.04473342001438141, 0.01574157550930977, -0.04423875734210014, -0.053338322788476944, 0.008577392436563969, 0.10632415115833282, 0.040030092000961304, 0.02552260458469391, 0.026821544393897057, -0.05510386824607849, 0.05976655334234238, -0.0008300095796585083, 0.06861157715320587, -0.049591872841119766, -0.07650840282440186, -0.004643433261662722, -0.03990425914525986, 0.06366871297359467, -0.014906020835042, -0.06371121108531952, 0.0194997675716877, -0.07784571498632431, 0.029953552410006523, 0.06530797481536865, -0.09173597395420074, 0.021494632586836815, 0.052978403866291046, -0.001283245743252337, -0.05061378329992294, 0.04639996960759163, 0.06478390842676163, -0.015909312292933464, 0.013739313930273056, -0.06675873696804047, -0.0704226866364479, 0.020883914083242416, 0.07323179394006729, -0.0010066484101116657, -0.002373248105868697, -0.07056596130132675, 0.024577656760811806, 0.04880139231681824, -0.038608577102422714, 0.07695038616657257, 0.002806240925565362, 0.006876204162836075, 0.006961337756365538, 0.059363361448049545, 0.021191507577896118, -0.06366844475269318, -0.015020458959043026, -0.0815785601735115, 0.004222068004310131, -0.07691111415624619, 0.02711009606719017, 0.014720573090016842, 0.022912023589015007, 0.05272422730922699, 0.08111070841550827, -0.018083568662405014, -0.0418405644595623, 0.08496879786252975, -0.04420621693134308, 0.090696781873703, -0.02872851863503456, -0.024066468700766563, 0.07789512723684311, -0.012021118775010109, 0.041637614369392395, 0.07615016400814056, -0.042834896594285965, 0.05792360380291939, -0.051077719777822495, -0.05241186544299126, 0.006270663347095251, 
-0.008865885436534882, -0.09101007878780365, 0.009276151657104492, 0.036050815135240555, -0.06729964166879654, -0.014552133157849312, -0.06943532824516296, -0.023805340752005577, -0.058313168585300446, -0.04949163272976875],
         "JCC_":[-0.03625413775444031, -0.041811503469944, -0.07486920803785324, -0.05052778869867325, 0.021635157987475395, -0.045879144221544266, 0.014834613539278507, -0.03941917419433594, -0.010327291674911976, -0.08194752782583237, 0.049111511558294296, 0.05970187485218048, 0.03878019377589226, -0.08208157867193222, 0.11816514283418655, -0.0021148237865418196, 0.022616155445575714, 0.02145639806985855, -0.056387223303318024, -0.07890307158231735, 0.049655016511678696, -0.09555239230394363, -0.07599814981222153, 0.04143097624182701, -0.029399001970887184, 0.01379090640693903, 0.04894237220287323, 0.04915700852870941, 0.020924754440784454, 0.11983200162649155, -0.045743830502033234, 0.04826069250702858, 0.06473162770271301, 0.032176557928323746, 0.012342192232608795, 0.03632035106420517, -0.011231182143092155, 0.03319219872355461, 0.012383898720145226, 0.017726020887494087, -0.027707353234291077, 0.052987076342105865, -0.06459034234285355, 0.03180805966258049, 0.038370322436094284, -0.018640436232089996, -0.05121193453669548, -0.052741218358278275, 0.0953487753868103, 0.0914265364408493, 0.08409767597913742, -0.009599939920008183, 0.02045055478811264, 0.009363643825054169, -0.00872961338609457, -0.08178623765707016, -0.008178372867405415, -0.005903102457523346, 0.05836755037307739, 0.011602274142205715, -0.02761419117450714, 0.016957316547632217, 0.04471946507692337, 0.005247261840850115, -0.05416998639702797, 0.00770663283765316, -0.06152857095003128, 0.021657155826687813, -0.04485960677266121, -0.0008541923016309738, 0.053551655262708664, 0.062185727059841156, -0.012641278095543385, -0.020507624372839928, -0.02900690771639347, 0.019629495218396187, 0.05620177462697029, -0.07772354781627655, -0.025509009137749672, 0.01923682540655136, 0.03035508468747139, 0.018665296956896782, 0.013450516387820244, 0.06740278005599976, 0.013274379074573517, 0.011593983508646488, 0.02331095188856125, 0.048694003373384476, 0.05861792340874672, -0.021130137145519257, 
0.02437412552535534, 0.059087324887514114, 0.024816056713461876, -0.050772879272699356, -0.01114521361887455, -0.028665395453572273, -0.09630053490400314, 0.0039062038995325565, -0.08236120641231537, 0.019473683089017868],
         "JMP":[-0.021766331046819687, -0.021576769649982452, -0.03795000538229942, 0.10449998080730438, -0.037742577493190765, -0.009156269021332264, 0.015289359726011753, 0.03519408404827118, -0.034353505820035934, 0.03226960077881813, 0.07340928167104721, 0.06086661294102669, 0.05736850947141647, -0.01725650765001774, -0.06702736765146255, -0.014972181059420109, 0.03435607627034187, 0.012023050338029861, 0.03370668366551399, -0.022338073700666428, -0.08280093967914581, -0.08060947060585022, 0.012210523709654808, -0.08165933936834335, 0.0016056479653343558, 0.015586943365633488, 0.11792927235364914, 0.06917431950569153, 0.02870137430727482, 0.01961304247379303, -0.027661900967359543, 0.10504695773124695, -0.03640349581837654, -0.01896090805530548, -0.011636625044047832, -0.04474593698978424, -0.029941411688923836, -0.058342345058918, 0.05885041877627373, 0.05553867667913437, 0.03953809291124344, 0.06787443161010742, 0.002061075298115611, 0.027305128052830696, 0.05792280659079552, 0.08001891523599625, 0.026575665920972824, -0.0171738862991333, -0.010685772635042667, -0.05422135442495346, 0.03660969436168671, 0.03091355785727501, -0.05900857225060463, 0.08500046283006668, -0.08218419551849365, 0.061078935861587524, 0.018783383071422577, 0.047520000487565994, -0.00014930205361451954, 0.002577823819592595, -0.06816059350967407, 0.041743114590644836, 0.03372296690940857, 0.016127480193972588, -0.07235685735940933, 0.024466760456562042, -0.03468412905931473, 0.037008773535490036, -0.060657840222120285, 0.016427740454673767, 0.08229042589664459, -0.061172664165496826, -0.009794612415134907, -0.024358782917261124, -0.06573519110679626, 0.09360098838806152, -0.07428182661533356, -0.02529928646981716, 0.09198813885450363, 0.025180503726005554, 0.03200048953294754, 0.018081925809383392, 0.0034776402171701193, 0.07848992198705673, -0.00043209362775087357, -0.01768604852259159, -0.043686315417289734, 0.04550321400165558, 0.11878672987222672, -0.008190528489649296, 
0.003286525374278426, 0.06845948845148087, 0.04892893135547638, -0.053277406841516495, -0.016919657588005066, 0.032096169888973236, 0.02839065156877041, -0.01713993400335312, -0.15167304873466492, -0.02013365738093853],
         "JMP_":[-0.014233234338462353, -0.0260892603546381, -0.13750334084033966, -0.050227466970682144, -0.042988359928131104, -0.027947310358285904, 0.08639533072710037, -0.16317786276340485, -0.03907149285078049, -0.05328908935189247, -0.03975899517536163, 0.04182944446802139, -0.010540750809013844, -0.11645861715078354, -0.012753792107105255, 0.002367585664615035, -0.05188040807843208, 0.0033823091071099043, -0.01240340806543827, -0.06099176034331322, -0.0015601427294313908, -0.11171454191207886, -0.04928319901227951, 0.05990544706583023, 0.015553089790046215, 0.04499414563179016, -0.034520961344242096, 0.07318194955587387, 0.013978325761854649, 0.07317976653575897, -0.029100794345140457, -0.09544635564088821, 0.030067358165979385, 0.057544808834791183, 0.005057932808995247, 0.005621553864330053, -0.03627946600317955, -0.0391962006688118, 0.03113878332078457, -0.02958066016435623, 0.012381716631352901, 0.011978821828961372, 0.13839371502399445, 0.010590317659080029, 0.06677765399217606, 0.046147286891937256, -0.05033441260457039, -0.020135121420025826, 0.032657306641340256, -0.05044032260775566, 0.05499301478266716, 0.07406507432460785, -0.0011679750168696046, 0.000989275984466076, 0.029161963611841202, 0.02679276280105114, 0.024040302261710167, 0.0710899606347084, -0.0035478041972965, -0.03730632737278938, -0.014350024051964283, 0.1638166308403015, -0.10163120925426483, 0.02900329977273941, -0.05366139113903046, 0.07186686992645264, -0.041340481489896774, 0.0401119627058506, -0.002295189071446657, -0.07949572801589966, -0.011504769325256348, 0.10675538331270218, -0.012056156061589718, -0.00748586468398571, -0.039624687284231186, -0.03555607795715332, -0.06799864768981934, -0.04550764709711075, -0.03302829712629318, -0.008404256775975227, 0.10563746094703674, -0.026095328852534294, 0.07613116502761841, 0.02101682499051094, 0.018749620765447617, 0.0056787943467497826, 0.005889789201319218, 0.03994893655180931, 0.05512934923171997, -0.004684743471443653, 
-0.01083239447325468, 0.0003112686099484563, 0.024348445236682892, -0.02665846049785614, 0.0064091463573277, -0.02719639055430889, 0.11076066642999649, -0.0014569570776075125, 0.0050220787525177, 0.032427236437797546],
-        "JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, -0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 
0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772],
         "LCMPXCHG":[0.0649508610367775, -0.04321656376123428, 0.08405561745166779, -0.07786691188812256, -0.05277935788035393, 0.011031142435967922, -0.0015533932019025087, 0.08730415254831314, -0.004414519295096397, 0.04040057212114334, -0.005748671013861895, -0.013907546177506447, 0.1028006374835968, 0.09900037944316864, -0.06475479900836945, 0.024365412071347237, -0.0727076306939125, 0.06610138714313507, -0.026073187589645386, 0.08258920162916183, -0.007938066497445107, 0.07641425728797913, 0.10221290588378906, 0.029036179184913635, -0.024506229907274246, 0.00953623466193676, -0.03283938392996788, -0.07194274663925171, -0.023513879626989365, -0.017550935968756676, -0.037860531359910965, 0.042062658816576004, 0.0501263290643692, 0.02325640618801117, 0.0018605751683935523, 0.012687316164374352, -0.016979143023490906, -0.059858907014131546, -0.07078705728054047, 0.033630695194005966, 0.036799900233745575, -0.03821465000510216, -0.059619177132844925, -0.06309511512517929, 0.0019384543411433697, -0.053095221519470215, 0.00571654736995697, 0.07134073972702026, -0.02115899883210659, 0.021287376061081886, -0.04855392873287201, 0.0103003466501832, -0.008993818424642086, 0.05131004378199577, -0.0734843909740448, 0.017303360626101494, 0.008291462436318398, 0.046435531228780746, -0.055057018995285034, -0.05454597249627113, -0.009126733057200909, -0.0012434959644451737, -0.0846821740269661, -0.017736544832587242, -0.04779898375272751, 0.020568806678056717, -0.061118245124816895, -0.012131555937230587, 0.024907736107707024, -0.0161012914031744, -0.011221951805055141, -0.029136324301362038, 0.04336633160710335, -0.00514700124040246, 0.004810850135982037, 0.014044326730072498, -0.07381691038608551, -0.064864382147789, 0.041784100234508514, 0.06648915261030197, 0.038817185908555984, -0.03421948850154877, 0.019546108320355415, -0.00579161336645484, -0.06579872220754623, -0.01745537295937538, -0.07164284586906433, 0.032588109374046326, 0.009170422330498695, -0.08387100696563721, 
-0.04743993282318115, 0.05926872417330742, 0.03129392862319946, -0.012995549477636814, 0.007799868006259203, 0.036110181361436844, 0.01603531278669834, -0.09735894203186035, 0.014374110847711563, -0.023844046518206596],
         "LD_Fp":[0.09850919246673584, 0.022097617387771606, -0.02880568616092205, 0.014175659976899624, -0.03401500731706619, -0.010281442664563656, -0.05501694604754448, -0.041856300085783005, 0.07016798853874207, -0.022585496306419373, -0.007230871357023716, 0.02143889106810093, 0.011802875436842442, -0.011940510012209415, 0.001225354615598917, -0.04420488327741623, 0.058923713862895966, 0.07726655155420303, -0.024950502440333366, -0.005545462481677532, 0.037338823080062866, -0.03718772903084755, 0.08340831100940704, 0.030300375074148178, -0.04332158342003822, -0.10117480903863907, -0.023774733766913414, 0.055412717163562775, 0.07188894599676132, 0.048699796199798584, 0.02051064558327198, -0.05177381634712219, 0.046848755329847336, 0.06421937793493271, 0.014812597073614597, 0.06599052250385284, 0.055128950625658035, 0.057206105440855026, 0.004570540506392717, 0.0006673894240520895, -0.04956628009676933, 0.018173960968852043, 0.009045585989952087, -0.09929032623767853, -0.0734606683254242, 0.009978558868169785, 0.016378602012991905, -0.0809779167175293, 0.028371425345540047, 0.07337132841348648, -0.0712965577840805, -0.07612331956624985, 0.023224541917443275, -0.01886812597513199, 0.049867402762174606, 0.04525093734264374, -0.04347287490963936, 0.04647829011082649, -0.020921878516674042, 0.055911704897880554, 0.0646883100271225, 0.043256886303424835, 0.012135359458625317, 0.06405725330114365, 0.04327752813696861, -0.06879010051488876, -0.02182726003229618, -0.030435195192694664, -0.04794333875179291, 0.03966866061091423, -0.05612926930189133, 0.061092350631952286, -0.047390542924404144, 0.06440525501966476, 0.07119303941726685, 0.036672186106443405, 0.039346762001514435, 0.05825766921043396, -0.05363740026950836, 0.026515239849686623, -0.021117106080055237, -0.061990927904844284, 0.06407181918621063, -0.02918284200131893, 0.06280291080474854, 0.05465791001915932, 0.025043612346053123, -0.015093226917088032, 0.0339696891605854, 0.039516378194093704, 
-0.005943501368165016, 0.037065502256155014, 0.0036617075093090534, -0.04032375290989876, -0.027956390753388405, -0.028206538408994675, 0.003602939657866955, 0.0015424611046910286, 0.03779160603880882, -0.012583530507981777],
         "LEA":[-0.07203060388565063, -0.017553633078932762, 0.0402604416012764, -0.03958871215581894, -0.035693515092134476, 0.006020952947437763, 0.06661038845777512, -0.05565638095140457, -0.07512512803077698, 0.015386131592094898, 0.1531272977590561, 0.07126382738351822, -0.018143991008400917, 0.0798688530921936, -0.0836813896894455, -0.005903773941099644, -0.03920849785208702, 0.025672506541013718, -0.017640162259340286, -0.09243063628673553, 0.0272371768951416, 0.04267166927456856, -0.032052017748355865, 0.06952647119760513, -0.03414658084511757, 0.05041181296110153, 0.04035321623086929, 0.04639449715614319, -0.000271787925157696, 0.1057962104678154, -0.031690120697021484, 0.0785541757941246, -0.008634688332676888, 0.035989925265312195, -0.00988205149769783, -0.047323428094387054, -0.018978994339704514, -0.001277003320865333, -0.022872451692819595, -0.034365635365247726, -0.04628191888332367, 0.06221615523099899, 0.01957613043487072, 0.13219280540943146, 0.03662179410457611, 0.046082716435194016, 0.011469600722193718, -0.025702660903334618, -0.08428508788347244, -0.07941769808530807, -0.06742636859416962, 0.0873873308300972, 0.0038614647928625345, 0.02177446149289608, -0.004519546404480934, -0.06213155761361122, -0.011228920891880989, -0.12034870684146881, 0.008946738205850124, 0.009164049290120602, -0.02258075587451458, 0.016061170026659966, 0.0645158663392067, 0.03723616153001785, -0.06451661139726639, -0.005219440441578627, -0.055180057883262634, 0.015841009095311165, -0.01621314138174057, -0.09887613356113434, 0.04894544556736946, -0.07996354252099991, 0.0138346366584301, -0.04036646708846092, -0.07073907554149628, -0.019294722005724907, -0.08181063830852509, -0.002301511587575078, -0.03429428115487099, 0.04098176211118698, 0.0706806555390358, 0.020024126395583153, 0.043529968708753586, 0.060017164796590805, 0.003525135340169072, -0.029752371832728386, -0.021769365295767784, 0.03941021487116814, -0.002250884659588337, -0.08078912645578384, 
0.015297000296413898, 0.026888463646173477, 0.048139896243810654, -0.04837239161133766, -0.036249756813049316, -0.027615496888756752, -0.15165935456752777, -0.03756902739405632, 0.015112340450286865, -0.0010633820202201605],
@@ -89,13 +83,10 @@
         "LXADD":[-0.11344388872385025, 0.08068472892045975, -0.041796449571847916, -0.043138183653354645, -0.049067553132772446, -0.005337296053767204, 0.021436110138893127, -0.035862036049366, -0.05354782193899155, 0.007918866351246834, -0.033625587821006775, 0.048349399119615555, 0.07167208194732666, -0.04589017853140831, -0.023661522194743156, 0.03580676391720772, 0.03326055034995079, 0.041535746306180954, -0.008772681467235088, -0.03362834453582764, -0.008885134011507034, -0.005286931060254574, -0.09389151632785797, 0.015108847059309483, -0.020455803722143173, 0.06477829068899155, 0.012845957651734352, -0.03201524540781975, -0.07100234925746918, 0.046879976987838745, -0.06030888110399246, 0.022502053529024124, -0.10942362248897552, -0.06978410482406616, 0.0714743509888649, 0.057766277343034744, 0.038102924823760986, -0.007761931978166103, -0.11331900954246521, -0.07498679310083389, -0.002573479898273945, -0.005142265930771828, -0.04596858471632004, -0.05356051027774811, 0.10633396357297897, -0.07426618784666061, 0.037482988089323044, 0.10527358204126358, 0.08239476382732391, 0.0678592249751091, -0.014271541498601437, -0.010673552751541138, -0.0767236202955246, 0.0329856239259243, -0.02222914807498455, -0.0019944666419178247, -0.0789676085114479, 0.006855306681245565, -0.012843947857618332, -0.10197136551141739, -0.036981865763664246, 0.04500154033303261, 0.0023044694680720568, -0.0031417198479175568, -0.06536462903022766, -0.02773689292371273, 0.06672050058841705, 0.046953968703746796, 0.009028433822095394, -0.008872197940945625, 0.09054717421531677, 0.009121377021074295, 0.09400534629821777, 0.012045130133628845, -0.014854185283184052, 0.030989984050393105, -0.030203191563487053, 0.09275887161493301, -0.009853487834334373, 0.038435857743024826, 0.05689401552081108, -0.06919367611408234, -0.02360834926366806, -0.08338318765163422, 0.01904873177409172, -0.027271559461951256, -0.05529508367180824, 0.09507890790700912, -0.03128642588853836, 0.026687508448958397, 
-0.05117009952664375, -0.03872146084904671, 0.08641110360622406, -0.027542488649487495, -0.09849996864795685, 0.05740527808666229, -0.02291804924607277, -0.10829142481088638, 0.008436905220150948, 0.027438905090093613],
         "MAXSDrr":[-0.06119297072291374, -0.04124095290899277, -0.0296846404671669, -0.045824289321899414, 0.02508155070245266, 0.007925539277493954, 0.043926920741796494, -0.03159729018807411, 0.019068658351898193, -0.013711963780224323, -0.028986897319555283, -0.04561398923397064, 0.04851536825299263, -0.03764308616518974, -0.018207892775535583, 0.016173269599676132, -0.004123492166399956, -0.025343073531985283, -0.09777097404003143, 0.0290510356426239, -0.06969164311885834, -0.06684337556362152, 0.04377250373363495, 0.06861237436532974, -0.046966683119535446, 0.0611143596470356, -0.044503044337034225, 0.023559842258691788, -0.029876690357923508, 0.011016200296580791, 0.07286348938941956, 0.00030023325234651566, 0.08359035104513168, 0.017708808183670044, 0.07800529897212982, -0.08712167292833328, 0.002862636698409915, -0.06735634058713913, 0.03052128478884697, 0.04226242005825043, 0.023851098492741585, -0.04562359303236008, -0.013745550066232681, 0.013936172239482403, -0.0647776871919632, -0.0487772636115551, 0.07015536725521088, -0.030445875599980354, -0.043143901973962784, -0.09556057304143906, 0.047779254615306854, 0.046041958034038544, 0.009388554841279984, 0.04671555384993553, -0.059331271797418594, 0.03360891714692116, 0.03569460287690163, 0.004674405790865421, 0.03280949592590332, -0.011293579824268818, -0.05531742051243782, 0.045912306755781174, 0.04241438955068588, -0.07023770362138748, -0.03889290615916252, 0.019566599279642105, 0.06292827427387238, -0.012180106714367867, -0.009482266381382942, 0.0033363515976816416, -0.028241898864507675, 0.04916750639677048, -0.011430651880800724, 0.05025538429617882, 0.02134493552148342, 0.04370661824941635, 0.08801361173391342, -0.04115797579288483, -0.06421534717082977, -0.051845721900463104, -0.041304778307676315, 0.0507316067814827, 0.049301628023386, -0.013558737933635712, -0.004291698802262545, 0.038709867745637894, -0.0636303573846817, -0.047141704708337784, 0.022303685545921326, 0.07054309546947479, 
0.009679436683654785, 0.0638614296913147, -0.046838339418172836, 0.01595005951821804, -0.025526082143187523, -0.0818924531340599, 0.016986405476927757, 0.023154381662607193, 0.06338698416948318, 0.07277237623929977],
         "MAXSSrr":[0.04370328411459923, 0.007435579318553209, 0.05632773041725159, 0.05872607231140137, -0.02179848775267601, -0.02491024136543274, -0.09028499573469162, -0.073136106133461, 0.0038046056870371103, -0.004702121019363403, 0.06376311928033829, 0.025374436751008034, 0.03343794494867325, -0.03841162100434303, 0.04050759971141815, 0.06359805166721344, -0.05459776520729065, -0.013898322358727455, 0.043059010058641434, 0.008913826197385788, -0.08469206839799881, 0.07041019201278687, -0.08591683208942413, 0.001833248999901116, 0.07940677553415298, -0.025694575160741806, -0.07197162508964539, -0.017312491312623024, -0.037606846541166306, -0.024861449375748634, 0.024707462638616562, -0.00026734761195257306, 0.033847302198410034, 0.05927937477827072, 0.04899705946445465, 0.0770091861486435, -0.09790053963661194, 0.057826053351163864, 0.05768071860074997, 0.01531772967427969, 0.0404951311647892, -0.04033346846699715, 0.05936214700341225, -0.029121382161974907, 0.044257547706365585, -0.10413498431444168, 0.09214437007904053, 0.017709942534565926, 0.026122651994228363, -0.08045665174722672, -0.03744427487254143, 0.09111800789833069, 0.0020880592055618763, 0.07745599746704102, 0.04109589755535126, -0.07718705385923386, -0.045550283044576645, 0.06791391223669052, 0.06261736899614334, -0.04795467481017113, 0.016496436670422554, 0.02853775955736637, -0.038986679166555405, 0.012603304348886013, 0.05299075320363045, 0.0022748250048607588, 0.00884503684937954, 0.1081618219614029, 0.05347983166575432, 0.03069908171892166, 0.015294212847948074, 0.0618034303188324, -0.07555301487445831, -0.0897526815533638, -0.07293840497732162, -0.02863491326570511, 0.01548877265304327, 0.09115951508283615, 0.011775748804211617, -0.009436656720936298, -0.07188120484352112, -0.004493236541748047, 0.0661926344037056, -0.04905804619193077, -0.06685564666986465, 0.06110713630914688, 0.018521195277571678, -0.04577818885445595, 0.07256703823804855, 0.0831693485379219, 0.008730655536055565, 
0.04827301949262619, 0.0754026547074318, 0.027548737823963165, -0.07210569083690643, -0.004550515208393335, -0.06998797506093979, -0.014580612070858479, -0.04511459916830063, 0.1119980439543724],
-        "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, 
-0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257],
-        "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, 
-0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583],
         "MINSDrr":[0.00284420233219862, 0.07673676311969757, 0.08602232486009598, 0.030074521899223328, -0.06255929172039032, -0.10135219246149063, 0.0772649347782135, 0.0045582992024719715, -0.01195931900292635, 0.009085145778954029, -0.04665979743003845, 0.019213048741221428, 0.022454556077718735, -0.05505772680044174, 0.035268958657979965, -0.06431140005588531, -0.001450810581445694, -0.027346337214112282, 0.041191086173057556, -0.0808955729007721, -0.04748200997710228, 0.0653977245092392, 0.042980875819921494, -0.04332194849848747, -0.024661004543304443, 0.09317019581794739, -0.06639514118432999, 0.013383567333221436, 0.051771167665719986, 0.05815904587507248, -0.05226780101656914, 0.079694002866745, -0.017969269305467606, -0.07137028127908707, -0.0011493286583572626, -0.02009846828877926, 0.006549016106873751, 0.0019126685801893473, 0.06168307736515999, -0.025323089212179184, 0.010943768545985222, 0.02157585136592388, -0.012993190437555313, -0.025179127231240273, -0.08958654850721359, -0.04273540899157524, 0.015248515643179417, 0.05456075817346573, 0.05705633386969566, -0.0038763433694839478, 0.08008016645908356, -0.004114328417927027, -0.01975642889738083, -0.014040309935808182, 0.025527596473693848, -0.06883629411458969, 0.06273050606250763, 0.05779215693473816, -0.061573851853609085, 0.01889919489622116, 0.026195447891950607, -0.021544434130191803, -0.0810774490237236, -0.016286203637719154, 0.01799311302602291, -0.08440321683883667, 0.0897485539317131, 0.08083964139223099, -0.006629236973822117, 0.051063962280750275, -0.08597207814455032, 0.029692046344280243, -0.03309508413076401, -0.09422174096107483, 0.0019163102842867374, 0.05546015128493309, -0.05980079993605614, -0.07416199892759323, -0.005134278908371925, 0.07392455637454987, -0.0634748563170433, 0.020546387881040573, -0.019978882744908333, 0.039572179317474365, -0.04754075035452843, -0.06090293824672699, -0.011185224168002605, -0.054661743342876434, 0.027916360646486282, -0.00819246843457222, 
-0.03119322657585144, 0.019949961453676224, 0.008312772959470749, 0.06788603216409683, 0.041624777019023895, 0.051687415689229965, -0.04819793254137039, -0.0761520192027092, -0.019374510273337364, -0.008435340598225594],
         "MINSSrr":[-0.06906168162822723, 0.008121289312839508, 0.010413543321192265, 0.052863992750644684, 0.01030051801353693, -0.009280139580368996, 0.016139337792992592, -0.05126945674419403, 0.06733083724975586, -0.01006366591900587, 0.06506948918104172, 0.05012301355600357, -0.07191506028175354, 0.018038516864180565, -0.020798280835151672, 0.08538958430290222, -0.028427604585886, 0.02630189247429371, 0.010489841923117638, 0.10011959075927734, -0.067482590675354, 0.01461686473339796, 0.03908747434616089, -0.015383233316242695, -0.03783239424228668, 0.06359098851680756, -0.052475571632385254, 0.07818790525197983, -0.0030931381043046713, 0.013684416189789772, 0.04222726821899414, 0.04708671569824219, 0.01192860770970583, 0.08628913760185242, -0.06380248814821243, -0.004006511997431517, -0.02817981317639351, -0.11196613311767578, 0.01953534409403801, 0.0034300305414944887, -0.040240559726953506, 0.004963779356330633, -0.06623393297195435, -0.04386508837342262, -0.08431598544120789, -0.023293999955058098, 0.02133636176586151, 0.04054516181349754, 0.04479363188147545, 0.02776535600423813, 0.01497643906623125, 0.026148531585931778, -0.05869835242629051, -0.07451415807008743, -0.009552933275699615, -0.004124804865568876, 0.08342882245779037, 0.05295371264219284, -0.05495591461658478, -0.07350015640258789, -0.05573306977748871, 0.07158630341291428, 0.04162517935037613, 0.0019162269309163094, -0.07742705941200256, -0.05673951655626297, 0.05760834366083145, 0.08143799751996994, 0.09629082679748535, -0.05737840384244919, 0.03762679174542427, 0.022383252158761024, 0.02897579036653042, -0.0929567888379097, 0.04767351970076561, 0.05145186930894852, 0.012956425547599792, 0.04237693175673485, 0.06772835552692413, 0.011290902271866798, -0.06324069201946259, -0.04689439386129379, 0.09521757066249847, 0.05625065788626671, -0.032533977180719376, -0.00987032800912857, -0.08346299827098846, -0.06292857229709625, 0.042861636728048325, 0.08865208923816681, -0.0021774298511445522, 
0.010668188333511353, -0.05791740491986275, 0.02240762859582901, -0.022414017468690872, 0.04343479871749878, 0.01852354407310486, -0.004329795949161053, -0.00262851663865149, -0.009029376320540905],
         "MOV":[-0.03924819082021713, -0.015029003843665123, 0.14121688902378082, -0.05414531007409096, -0.01409768033772707, 0.05467522144317627, -0.0798286497592926, 0.042834796011447906, -0.04328306391835213, -0.12638653814792633, 0.02380293421447277, -0.010002975352108479, -0.03018246777355671, -0.09843093156814575, -0.015159506350755692, -0.03186051547527313, -0.009830419905483723, 0.024049948900938034, -0.028536750003695488, -0.05252794921398163, -0.003984724637120962, -0.09075328707695007, -0.015937313437461853, 0.07316069304943085, 0.002778300317004323, 0.003214895725250244, -0.0832214206457138, 0.012602301314473152, 0.0687694102525711, 0.1425037384033203, -0.04724106192588806, 0.05618143081665039, 0.0028424363117665052, 0.03067261539399624, 0.008477674797177315, -0.002142940880730748, 0.0036045191809535027, -0.02257452718913555, 0.013552851043641567, -0.016065331175923347, 0.03364546224474907, 0.0027604023925960064, -0.013575572520494461, 0.1340155154466629, 0.04859570413827896, 0.07984673976898193, 0.006813493091613054, -0.017625009641051292, -0.0667564794421196, -0.0025298972614109516, -0.06280945241451263, 0.08589767664670944, -0.011751428246498108, 0.04074618220329285, 0.0561428964138031, -0.0068444423377513885, 0.028041694313287735, 0.06258948892354965, 0.02493610419332981, -0.018480388447642326, -0.035079196095466614, 0.14365622401237488, -0.046609606593847275, 0.040164150297641754, -0.049927353858947754, 0.06781942397356033, -0.04828719049692154, 0.03496144339442253, -0.044686879962682724, 0.04254060238599777, 0.024320241063833237, -0.0031205937266349792, -0.049061503261327744, -0.028716804459691048, -0.056192029267549515, 0.022012677043676376, -0.0745186060667038, -0.0008951064082793891, -0.051033493131399155, 0.023357892408967018, 0.06984421610832214, 0.0057564410381019115, -0.005192344542592764, -0.003961252048611641, -0.012275456450879574, -0.018581852316856384, -0.0046620736829936504, 0.02494811825454235, 0.0520334355533123, 
-0.02435225434601307, 0.0008846594137139618, 0.017687007784843445, 0.07866063714027405, -0.025595100596547127, -0.020679078996181488, -0.027750879526138306, 0.10005537420511246, -0.015581297688186169, -0.08011393249034882, 0.028118811547756195],
         "MUL":[-0.026987887918949127, 0.06016572564840317, 0.0787728950381279, -0.0803905576467514, 0.005736608523875475, -0.07245960086584091, -0.02662983350455761, 0.012340782210230827, 0.042490337044000626, 0.06399581581354141, -0.009004191495478153, 0.0370473749935627, -0.0605553574860096, -0.09520823508501053, 0.0010566662531346083, -0.028270091861486435, 0.08631408214569092, 0.002891023177653551, -0.051674507558345795, -0.04089691862463951, -0.04444378614425659, -0.061945777386426926, -0.026001833379268646, 0.04689744487404823, -0.07711070775985718, 0.07018855959177017, -0.02606336772441864, 0.054914504289627075, 0.03522270917892456, -0.027317974716424942, 0.02187947928905487, 0.009710998274385929, 0.01340037677437067, 0.016422593966126442, -0.058249425143003464, -0.08377814292907715, -0.04476138949394226, 0.04349169507622719, 0.05062006786465645, 0.01706511154770851, 0.020649245008826256, 0.06287672370672226, -0.03981941193342209, 0.04973218962550163, -0.03353424742817879, -0.016799092292785645, -0.031751759350299835, 0.10430201143026352, -0.04326871410012245, 0.0736854076385498, -0.0768580436706543, -0.03183818608522415, 0.010583195835351944, 0.015541432425379753, 0.03191666305065155, 0.020011236891150475, 0.041239380836486816, -0.0029152908828109503, 0.009499716572463512, -0.011166329495608807, 0.03469998389482498, 0.00607832008972764, 0.030300112441182137, -0.040855471044778824, 0.00988304428756237, 0.050531189888715744, 0.06647889316082001, -0.027519647032022476, -0.06819992512464523, 0.02215251699090004, 0.086424820125103, -0.03395787626504898, -0.020825445652008057, 0.08309803158044815, -0.0256529338657856, 0.005000723991543055, -0.03375622257590294, 0.005569287110120058, -0.028089171275496483, 0.04142652079463005, -0.03232670575380325, 0.025872791185975075, -0.07439207285642624, 0.04975134879350662, 0.049770113080739975, -0.05090470612049103, -0.04476647078990936, 0.09217675030231476, 0.05079415813088417, 0.017867455258965492, -0.04477125406265259, 
0.004301204811781645, 0.05066722631454468, -0.08186711370944977, 0.008772231638431549, -0.10532139241695404, 0.004499110858887434, 0.03296274691820145, -0.0020684772171080112, 0.05012065917253494],
-        "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 
0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 0.06334574520587921, 0.056647684425115585, -0.02965259924530983],
         "NEG":[-0.0585959330201149, -0.02519698068499565, 0.029133861884474754, -0.003332944354042411, 0.05054186284542084, -0.03572014719247818, -0.012210451066493988, 0.06708117574453354, -0.0712793841958046, -0.01644597202539444, 0.06453811377286911, -0.03662518784403801, 0.0545802004635334, -0.11130833625793457, -0.04544609412550926, 0.012950814329087734, 0.08011337369680405, 0.014672964811325073, 0.0030391360633075237, -0.10994786024093628, 0.004102041013538837, -0.0749390497803688, -0.010000540874898434, 0.062072113156318665, 0.03312767669558525, -0.04764379560947418, -0.033307697623968124, 0.02903047949075699, 0.0319744311273098, 0.027374137192964554, -0.05640692263841629, -0.01572772115468979, 0.019634589552879333, 0.0629790723323822, -0.024743184447288513, -0.09348101913928986, 0.04078087955713272, 0.0002063393039861694, 0.01791796088218689, -0.01174850668758154, 0.0067609078250825405, 0.031922854483127594, 0.045338794589042664, -0.06706424057483673, -0.03090975433588028, 0.035511564463377, 0.0377444289624691, -0.007464382331818342, 0.02387971244752407, -0.023001981899142265, -0.0052301278337836266, 0.08532170951366425, 0.00384823651984334, 0.0689602717757225, -0.05606595426797867, 0.03483026847243309, 0.023350417613983154, -0.06512849777936935, 0.0627395287156105, -0.0203714482486248, -0.009735504165291786, 0.06432165950536728, -0.04546240717172623, 0.0322086475789547, 0.004561635199934244, 0.040702879428863525, -0.0680280476808548, 0.025354159995913506, -0.07624178379774094, 0.06776861846446991, 0.07863514125347137, -0.037652503699064255, -0.023264721035957336, 0.030604641884565353, -0.07419195026159286, 0.014679630286991596, 0.1294829547405243, 0.007591600529849529, -0.06612348556518555, 0.03127516806125641, 0.10645392537117004, -0.018773522228002548, 0.03992835432291031, 0.044048961251974106, 0.00023814172891434282, -0.06797933578491211, -0.08000202476978302, -0.04320430010557175, 0.043590281158685684, -0.05034546181559563, 0.014501169323921204, 
0.03329288214445114, 0.03045976720750332, -0.01932660862803459, -0.026188183575868607, -0.1232738122344017, -0.04858024790883064, -0.015570580027997494, 0.013346930965781212, 0.009410912171006203],
         "NOT":[0.02556992694735527, -0.0005189123330637813, 0.010195978917181492, -0.027382172644138336, -0.0374554842710495, 0.08793098479509354, 0.0024311996530741453, -0.08769379556179047, -0.054654307663440704, -0.08747632801532745, 0.09218847006559372, 0.0972878560423851, 0.044738128781318665, -0.02398994378745556, -0.046165600419044495, -0.0002692296984605491, -0.03797682002186775, 0.05161413550376892, -0.033769138157367706, 0.011279402300715446, 0.08941229432821274, -0.07437314093112946, -0.025249861180782318, 0.1026485413312912, -0.042062994092702866, 0.022835882380604744, 0.05108749121427536, -0.054616689682006836, -0.04208545386791229, 0.10205414891242981, -0.02474227361381054, -0.01605238951742649, -0.011079655028879642, -0.04231556877493858, -0.058844879269599915, 0.0017704797210171819, 0.005396600812673569, -0.058835554867982864, 0.03384264558553696, -0.024245088919997215, 0.03355555981397629, 0.02017929218709469, 0.04421762749552727, 0.09027500450611115, 0.03916880115866661, 0.042518291622400284, 0.024490609765052795, 0.00026937652728520334, -0.010342003777623177, -0.05488119646906853, 0.07418034970760345, 0.0008032438345253468, 0.09190968424081802, 0.07747997343540192, -0.024773627519607544, 0.0496656633913517, -0.038326963782310486, -0.0022213482297956944, 0.02448110282421112, 0.0022990668658167124, 0.052763812243938446, 0.051123637706041336, 0.03795074671506882, 0.06734737008810043, -0.030445149168372154, 0.021410485729575157, -0.044919464737176895, -0.0011586989276111126, -0.0903671532869339, -0.01408425159752369, 0.07342954725027084, -0.04118982329964638, -0.008432484231889248, -0.0008165669860318303, -0.0642886608839035, 0.007230957038700581, -0.0670868456363678, -0.01116579957306385, -0.09545603394508362, -0.03109285980463028, 0.005951744969934225, 0.024672016501426697, -0.04027184471487999, 0.03607063740491867, 0.023179687559604645, 0.0117312828078866, -0.019768331199884415, -0.023262612521648407, 0.04165903106331825, -0.039224691689014435, 
0.040571704506874084, 0.08653629571199417, 0.027772698551416397, -0.08196783810853958, -0.013821743428707123, 0.004212009254842997, 0.01664070598781109, -0.008459849283099174, 0.041462354362010956, 0.06886350363492966],
         "OR":[-0.0010318798013031483, -0.058885037899017334, 0.015562368556857109, -0.03459857404232025, -0.006239954382181168, 0.04347813501954079, -0.043183062225580215, -0.06115246191620827, -0.08097145706415176, -0.040188197046518326, 0.02098822593688965, -0.013338722288608551, -0.01845080405473709, -0.07172099500894547, -0.00026761949993669987, 0.015059647150337696, -0.08275016397237778, 0.10280061513185501, -0.017712965607643127, -0.07511771470308304, 0.007648291997611523, -0.12827979028224945, -0.020353827625513077, 0.08809063583612442, -0.02829514630138874, 0.003038457129150629, -0.04399721696972847, 0.046383049339056015, 0.06416497379541397, -0.0006932668038643897, -0.033501505851745605, -0.012374987825751305, 0.018504725769162178, 0.00529597420245409, -0.040804456919431686, -0.00419827364385128, -0.017476536333560944, -0.04530858248472214, 0.01608600653707981, -0.08898036181926727, -0.015132613480091095, -0.053797122091054916, -0.011825251393020153, 0.09507828205823898, 0.08454664051532745, 0.04075947031378746, 0.020354142412543297, 0.01704799383878708, -0.026439497247338295, -0.04004717990756035, -0.053405825048685074, 0.04079057276248932, 0.026150185614824295, 0.04538597911596298, 0.046778932213783264, 0.057205770164728165, 0.037173718214035034, -0.07114585489034653, 0.03480122983455658, 0.0069038826040923595, -0.056386105716228485, -0.03294815868139267, 0.04636325314640999, -0.05767818167805672, -0.05788124352693558, -0.011048000305891037, -0.04350278526544571, 0.029680529609322548, -0.0512658953666687, 0.04321866109967232, 0.047014784067869186, -0.014913392253220081, -0.007425297982990742, -0.09810416400432587, -0.07316632568836212, 0.05063875392079353, -0.07298189401626587, -0.012434680946171284, -0.09386061877012253, 0.016765601933002472, 0.06658460199832916, 0.0014198448043316603, -0.022241152822971344, 0.05902376398444176, 0.057584285736083984, 0.024565961211919785, -0.02896188013255596, 0.006485136691480875, 0.05981580168008804, 
-0.015995489433407784, 0.027470067143440247, 0.09679803997278214, 0.0342426523566246, -0.08387557417154312, -0.015599220991134644, -0.0049544889479875565, -0.06524655222892761, 0.02150602824985981, 0.016511479392647743, 0.055177561938762665],
@@ -115,7 +106,6 @@
         "PCMPGTBrr":[-0.04665364325046539, -0.03588206693530083, 0.05219453573226929, 0.08376432955265045, 0.05562759190797806, -0.0034289404284209013, 0.08200010657310486, 0.023898538202047348, -0.002851601457223296, -0.08778133243322372, 0.017107484862208366, 0.08448091894388199, 0.020043527707457542, 0.038858626037836075, 0.036468397825956345, -0.0069902255199849606, -0.09442859143018723, 0.0018075992120429873, 0.05577728524804115, -0.0005804274696856737, 0.029588190838694572, -0.050955869257450104, 0.016604335978627205, -0.054141607135534286, -0.030936168506741524, 0.004688458051532507, -0.02321118488907814, -0.009524177759885788, 0.030161075294017792, -0.0557246096432209, 0.017830688506364822, 0.04058525711297989, 0.023080267012119293, 0.04536818340420723, 0.09658516198396683, 0.004083207808434963, 0.053284309804439545, 0.07114734500646591, 0.03272407501935959, -0.06646303087472916, 0.08200454711914062, -0.06558514386415482, 0.0745493471622467, -0.0010506648104637861, -0.02250707894563675, 0.015057512558996677, -5.047186277806759e-06, 0.04663649946451187, 0.06489380449056625, -0.0477377213537693, -0.08882559835910797, 0.08948437124490738, -0.052260447293519974, 0.06798093020915985, -0.06404604762792587, 0.0005905702710151672, 0.014312930405139923, 0.0370929092168808, 0.03622571751475334, 0.06601805984973907, 0.04077596217393875, -0.0019877473823726177, -0.02357509359717369, 0.04524341970682144, 0.024309739470481873, -0.05969798564910889, -0.015872884541749954, -0.055400021374225616, 0.04820183292031288, 0.024034500122070312, -0.05125486105680466, 0.020366262644529343, 0.03310052305459976, 0.1036759540438652, 0.049202825874090195, -0.010945710353553295, -0.030628688633441925, 0.048871662467718124, 0.07457619905471802, 0.017111260443925858, 0.028184816241264343, -0.09065181016921997, -0.017116032540798187, -0.06233282387256622, -0.011385255493223667, -0.06190027296543121, -0.01189250499010086, -0.03632708638906479, 0.04705822467803955, 0.0022293981164693832, 
0.06782552599906921, -0.0490303635597229, -0.08690774440765381, -0.08311695605516434, 0.04079030826687813, 0.022971853613853455, -0.019726071506738663, -0.032829709351062775, -0.05147984251379967, -0.06768873333930969],
         "PCMPGTDrr":[0.026095403358340263, 0.009877854026854038, -0.022390423342585564, -0.06749505549669266, 0.03866114094853401, 0.07523459941148758, -0.02331429533660412, -0.013958744704723358, -0.05151516944169998, -0.033018071204423904, -0.017118683084845543, 0.06611985713243484, 0.024562569335103035, 0.027193237096071243, -0.04081164300441742, -0.0557839497923851, 0.07676059752702713, -0.017435213550925255, -0.0696197971701622, 0.04529204219579697, 0.015718640759587288, -0.0868423655629158, -0.025476763024926186, 0.1075882539153099, 0.08407340198755264, 0.03219793736934662, -0.029079284518957138, -0.10067792236804962, -0.01665782555937767, -0.002518820110708475, 0.06302576512098312, -0.042360853403806686, -0.014688530936837196, -0.04797102138400078, -0.05708448588848114, 0.05345156416296959, -0.03360274061560631, -0.006362707354128361, 0.045909661799669266, -0.0034944594372063875, -0.04771789163351059, -0.015326191671192646, -0.017800530418753624, 0.009678518399596214, -0.01412744726985693, 0.09620117396116257, 0.0705861821770668, -0.0663042888045311, 0.07589521259069443, -0.08846025168895721, 0.008178732357919216, -0.023293234407901764, 0.049390021711587906, 0.00771696399897337, -0.026583032682538033, 0.012981866486370564, -0.06098538264632225, -0.04784953594207764, -0.001411060569807887, -0.0646580159664154, 0.07771933078765869, 0.012061100453138351, 0.026251494884490967, 0.024035189300775528, 0.00368816708214581, 0.019370727241039276, 0.0473535880446434, 0.0688827782869339, -0.0656280517578125, 0.0001225982268806547, -0.04765431582927704, 0.08570858836174011, 0.06544618308544159, 0.02309294231235981, -0.07891835272312164, 0.05969972908496857, 0.04259306937456131, -0.0388357900083065, 0.10700955986976624, -0.03643207252025604, -0.014097973704338074, 0.018475063145160675, -0.008959461003541946, -0.04132810980081558, -0.01586003415286541, -0.013873838819563389, 0.07354859262704849, 0.003967848140746355, -0.023853322491049767, -0.013099947944283485, 
0.06407736241817474, 0.03060499019920826, -0.08859552443027496, 0.009045977145433426, -0.09939071536064148, -0.022137949243187904, -0.03951180726289749, -0.0316530205309391, 0.05501912534236908, -0.06330689787864685],
         "PEXTRWrr":[-0.05698293820023537, -0.02332535944879055, -0.01313185878098011, 0.08844685554504395, -0.030702419579029083, -0.042257267981767654, -0.06976033002138138, 0.08907881379127502, 0.040486857295036316, 0.01966431364417076, 0.011261478997766972, 0.011022844351828098, -0.0069642444141209126, -0.016230706125497818, -0.009695738554000854, -0.04666578397154808, 0.016855308786034584, -0.03308985382318497, 0.01504850760102272, 0.09940154105424881, -0.07109691947698593, 0.043378811329603195, -0.06964893639087677, -0.05999808758497238, 0.008651218377053738, 0.04237857088446617, 0.04557272046804428, 0.04033806174993515, -0.005760873202234507, 0.008976156823337078, 0.05276636406779289, -0.06584233790636063, -0.011512805707752705, -0.01598522625863552, -0.044132646173238754, -0.020889364182949066, -0.09435509145259857, -0.02823605202138424, 0.0820322185754776, -0.0391690619289875, 0.03367430716753006, -0.029474111273884773, -0.07719384133815765, 0.003098628716543317, 0.05822441354393959, -0.09175454080104828, 0.02256210707128048, -0.004901964217424393, -0.008566503413021564, 0.040359016507864, -0.04049991816282272, 0.010366388596594334, -0.05293237417936325, -0.0956558957695961, -0.01418458204716444, 0.05464276298880577, -0.014091472141444683, 0.023551519960165024, -0.042662639170885086, -0.07025191932916641, -0.0017952515045180917, 0.07680258899927139, -0.10743812471628189, -0.08435508608818054, -0.00337960640899837, -0.03381747379899025, 0.027066459879279137, -0.009784750640392303, 0.04265652969479561, 0.02066781371831894, -0.03692338988184929, 0.0029027678538113832, 0.06893923878669739, 0.03784753382205963, -0.04037536308169365, -0.09532847255468369, 0.03193795308470726, 0.0387917198240757, 0.03887058049440384, -0.0002478501701261848, -0.0671166405081749, 0.06754262745380402, 0.01643708348274231, 0.012460017576813698, 0.03147564455866814, 0.05646798014640808, 0.014081758446991444, 0.07141963392496109, 0.016428180038928986, 0.0443485863506794, 
0.06492826342582703, 0.09964785724878311, 0.026795320212841034, 0.0271765124052763, -0.015695465728640556, -0.08133535832166672, -0.05439477041363716, 0.04913243651390076, 0.024485180154442787, -0.04072758927941322],
-        "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, 
-0.001252106623724103, 0.07216084003448486, -0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063],
         "PMOVMSKBrr":[0.07294902205467224, -0.00040799094131216407, -0.01483855675905943, -0.02571418508887291, 0.08466307818889618, -0.03447218984365463, -0.05685977265238762, -0.019133185967803, 0.06332023441791534, -0.061352625489234924, -0.023195402696728706, -0.05378473922610283, -0.05650350823998451, 0.06583224982023239, -0.012845925986766815, -0.052972156554460526, 0.049470845609903336, -0.04565730318427086, 0.09717552363872528, -0.014171762391924858, 0.013508875854313374, 0.004057068843394518, -0.020556267350912094, -0.10475417971611023, 0.018426941707730293, -0.07273723930120468, 0.01702595315873623, -0.013097747229039669, -0.07530277967453003, 0.05442536994814873, -0.0601920410990715, -0.05255919322371483, -0.07305102050304413, 0.02758030779659748, 0.06180129200220108, 0.10606050491333008, 0.046477098017930984, -0.024062691256403923, 0.07360008358955383, -0.011283098720014095, -0.03712400794029236, -0.09973011910915375, 0.018314119428396225, 0.009135990403592587, -0.01891133189201355, 0.00915572326630354, 0.006080301944166422, -0.02368554100394249, -0.019582828506827354, 0.051494162529706955, -0.010953089222311974, 0.011621126905083656, 0.010515356436371803, 0.011188569478690624, -0.0202876515686512, 0.038686931133270264, -0.066365085542202, 0.014182188548147678, 0.00445093447342515, 0.05712618678808212, -0.04463819041848183, -0.10292281210422516, -0.011173201724886894, 0.0029098563827574253, 0.06890314072370529, 0.06398330628871918, 0.03248615562915802, -0.05457807704806328, -0.006898659747093916, 0.038892313838005066, -0.09130232781171799, 0.013324378058314323, -0.033766016364097595, -0.043404608964920044, 0.018701359629631042, -0.03784232959151268, -0.05014420300722122, 0.04404780641198158, 0.09254389256238937, 0.09839074313640594, -0.028214668855071068, 0.03262662887573242, 0.04281335324048996, 0.07356158643960953, -0.0773080587387085, 0.026536725461483, -0.06819723546504974, 0.03335537016391754, 0.09355103969573975, -0.052649617195129395, 
-0.08467497676610947, -0.06516479700803757, -0.07499512284994125, 0.023276200518012047, -0.06063856557011604, -0.044472258538007736, 0.03155883774161339, -0.011262890882790089, 0.04045895114541054, 0.012343645095825195],
         "PMULUDQrr":[-0.018331514671444893, 0.04249238595366478, 0.0718526765704155, 0.03221653401851654, -0.04829120263457298, -0.02055567130446434, 0.05200991779565811, -0.04337913170456886, -0.02698952704668045, 0.05037892237305641, 0.014545431360602379, 0.09035851061344147, 0.0777752548456192, -0.06762461364269257, 0.032133519649505615, 0.048851024359464645, 0.01295433659106493, 0.054136257618665695, 0.09599477052688599, 0.024489495903253555, 0.05683024227619171, -0.05242127552628517, -0.043476004153490067, 0.004586773458868265, 0.024281315505504608, 0.03402777388691902, 0.0033939755521714687, 0.049474406987428665, 0.0011405921541154385, 0.06828528642654419, 0.08426304161548615, -0.029339993372559547, -0.04173621907830238, -0.03966334089636803, -0.03011258877813816, -0.07684683799743652, 0.040944185107946396, -0.04709877818822861, 0.07968004047870636, 0.07534269988536835, -0.006957313045859337, -0.0016522067598998547, -0.017229178920388222, 0.030470186844468117, 0.05390452966094017, 0.05233803763985634, 0.045554302632808685, -0.03710555285215378, 0.05699322372674942, 0.019888387992978096, 0.10152119398117065, 0.026563912630081177, -0.0018862299621105194, -0.02453959546983242, -0.06107368320226669, -0.04910692200064659, -0.06316373497247696, 0.04648333042860031, -0.00939352996647358, 0.030374331399798393, 0.0027768383733928204, 0.07302171736955643, -0.0035402378998696804, 0.054474033415317535, -0.0739617869257927, 0.01190911140292883, -0.019428657367825508, -0.006644500885158777, -0.04998863860964775, 0.03215506672859192, 0.054085105657577515, 0.047874726355075836, 0.10735851526260376, 0.030255280435085297, 0.029996531084179878, 0.006218941882252693, 0.04892734810709953, 0.06425125896930695, -0.017792150378227234, 0.041398752480745316, -0.017293022945523262, -0.011015499010682106, -0.02933122031390667, -0.005825115367770195, -0.07212502509355545, 0.10469445586204529, 0.009840304031968117, 0.026172513142228127, 0.002459621522575617, -0.02771947532892227, 
-0.006639100145548582, -0.04062161594629288, -0.0746249407529831, 0.04523816704750061, -0.07439430058002472, 0.06977812945842743, 0.008738852106034756, 0.06937781721353531, 0.07391723990440369, -0.09542208909988403],
         "POPCNT":[0.032459065318107605, 0.11127372831106186, -0.004006756469607353, 0.06373029947280884, 0.07161973416805267, -0.07966824620962143, -0.014274416491389275, 0.02168503776192665, -0.060636017471551895, -0.051414258778095245, 0.003268218832090497, 0.05552225932478905, 0.01940925046801567, -0.05398592725396156, 0.09021458029747009, -0.060922130942344666, -0.0407782681286335, -0.027882883325219154, 0.012706448324024677, -0.02730434015393257, 0.05854162946343422, -0.0798129290342331, -0.00179530237801373, 0.04958317428827286, -0.04621487483382225, 0.0524308979511261, -0.03889109939336777, 0.07240460813045502, 0.06366933137178421, 0.029314585030078888, -0.014743340201675892, -0.021233027800917625, 0.06803205609321594, -0.01269250176846981, -0.033408213406801224, 0.09638478606939316, -0.02009841799736023, -0.014619074761867523, 0.022498659789562225, 0.006679723970592022, -0.016163295134902, 0.09717728197574615, -0.010882971808314323, -0.09489153325557709, 0.046623144298791885, -0.04596618935465813, -0.026864662766456604, 0.01605546846985817, 0.05979238823056221, -0.024411896243691444, 0.039511535316705704, -0.0108433086425066, -0.05629622936248779, 0.02339898608624935, -0.025785285979509354, 0.011886742897331715, 0.08834438771009445, -0.08506806194782257, 0.021776534616947174, 0.01446699257940054, -0.009117010980844498, -0.022380229085683823, -0.0541100800037384, -0.040569182485342026, -0.02888612262904644, 0.07774273306131363, -0.052350424230098724, -0.039240963757038116, 0.004771160893142223, 0.014987779781222343, -0.05511622130870819, 0.019763313233852386, -0.0920683741569519, 0.021821241825819016, 0.10812623798847198, -0.06422155350446701, -0.07388156652450562, 0.00949418731033802, -0.06905169039964676, 0.006180475000292063, -0.02844754233956337, 0.11084792017936707, -0.03348945826292038, 0.06860767304897308, -0.0214154664427042, -0.0008655296987853944, -0.020698973909020424, 0.03369581326842308, 0.019848104566335678, 0.013533092103898525, 
0.03423681482672691, 0.014547858387231827, 0.02418140508234501, -0.013769546523690224, -0.09633788466453552, 0.01689709909260273, -0.01452709175646305, 0.047873757779598236, -0.0012036423431709409, 0.03720762953162193],
@@ -145,7 +135,6 @@
         "RET":[-0.09685279428958893, 0.0101965656504035, -0.04206235706806183, -0.05282443389296532, 0.050776951014995575, -0.006812752690166235, 0.09618920832872391, 0.04637071117758751, -0.018928129225969315, -0.04118828848004341, -0.06039129197597504, -0.018619466572999954, -0.07845143973827362, -0.14034120738506317, -0.03397035226225853, -0.028233898803591728, -0.08162513375282288, 0.048710327595472336, -0.04177732393145561, -0.08455172181129456, 0.00312337395735085, 0.03531079366803169, -0.057201240211725235, 0.09391707926988602, -0.02847883477807045, 0.01840023323893547, -0.04936904460191727, 0.027487540617585182, 0.08041024953126907, -0.08714525401592255, 0.11963017284870148, -0.0762581005692482, -0.06482874602079391, 0.038007382303476334, -0.003661463735625148, 0.0064629544503986835, -0.08281382918357849, -0.053177930414676666, 0.01966426707804203, -0.04822755232453346, -0.0474051795899868, 0.026990806683897972, -0.057971399277448654, 0.12347304075956345, -0.02745792828500271, 0.0832793116569519, 0.03029884397983551, -0.032751865684986115, -0.022912420332431793, -0.030569355934858322, 0.0971289873123169, 0.07298070192337036, 0.0306894201785326, 0.05817654728889465, 0.005174126010388136, 0.042281877249479294, 0.01975836046040058, -0.11205509305000305, 0.05081645026803017, 0.0034761943388730288, -0.03858469799160957, 0.007316718343645334, 0.07441510260105133, 0.004579664673656225, -0.021868426352739334, 0.01116174180060625, 0.061042461544275284, 0.029598504304885864, -0.06691239774227142, 0.03223221376538277, 0.0867755264043808, 0.05488765984773636, -0.019738517701625824, -0.030367519706487656, -0.06396497040987015, -0.0022451707627624273, -0.06131305173039436, -0.03129804506897926, -0.05657076835632324, 0.009733426384627819, -0.08145039528608322, -0.09049411863088608, 0.004821183159947395, 0.038612931966781616, -0.019062234088778496, -0.021097682416439056, -0.06061801686882973, 0.019766775891184807, 0.0276743546128273, -0.057942990213632584, 
-0.033430278301239014, 0.0043391571380198, 0.05848158895969391, 0.0826464518904686, 0.09988056123256683, -0.05677378550171852, -0.11326800286769867, 0.051275406032800674, 0.01158174965530634, 0.04368240013718605],
         "ROL":[0.026423713192343712, 0.08523924648761749, 0.005345864687114954, 0.027778491377830505, 0.06572498381137848, 0.056946828961372375, -0.03009108640253544, -0.05564097315073013, 0.07753216475248337, -0.07402804493904114, -0.05589171126484871, -0.050976503640413284, 0.041095346212387085, -0.06708681583404541, 0.08517566323280334, 0.02110634744167328, -0.027871981263160706, 0.0005450723110698164, 0.07511565834283829, 0.0016275837551802397, 0.04902505874633789, 0.024746844545006752, 0.08780711144208908, -0.06167766824364662, 0.06365402787923813, 0.06462119519710541, -0.04920244216918945, 0.056112516671419144, 0.10561680048704147, 0.07879003882408142, 0.03879575803875923, -0.03582729026675224, 0.004805437754839659, 0.030719229951500893, 0.0558336041867733, 0.04387545958161354, 0.020841658115386963, 0.015068157576024532, -0.008266274817287922, 0.05914990231394768, -0.01581275276839733, 0.060716625303030014, -0.02257946878671646, 0.00995479617267847, 0.002104438142850995, 0.03806104138493538, 0.010437156073749065, 0.039603881537914276, -0.02074524573981762, 0.024094516411423683, 0.031944990158081055, -0.07122939079999924, -0.023190783336758614, 0.006518832873553038, -0.04528677463531494, -0.02354210615158081, -0.03518632799386978, -0.07059651613235474, -0.017474880442023277, 0.06688393652439117, -0.07900173962116241, -0.05843310430645943, 0.05351021885871887, -0.05724814161658287, -0.02697751857340336, -0.031128596514463425, 0.03040527179837227, -0.009157841093838215, -0.07642515003681183, -0.042137425392866135, -0.031383614987134933, -0.07586777210235596, 0.0489036925137043, 0.0657171905040741, 0.027123138308525085, 0.034842655062675476, -0.035231154412031174, 0.009778738021850586, -0.06150955334305763, 0.042132262140512466, 0.08945925533771515, -0.07213590294122696, -0.0518047958612442, -0.07094760239124298, 0.07041053473949432, 0.1046413779258728, 0.02394813485443592, -0.014966128394007683, -0.04967860132455826, -0.03941388055682182, -0.10642798990011215, 
-0.03915626183152199, -0.10921923071146011, -0.035421375185251236, 0.039855729788541794, 0.04145469889044762, -0.025123557075858116, 0.06743432581424713, -0.02060243859887123, 0.02994687482714653],
         "ROR":[-0.03797177970409393, -0.03406170755624771, -0.014866529032588005, -0.002243943279609084, 0.024476991966366768, -0.08789698034524918, 0.02924288995563984, -0.03145875036716461, -0.0030907171312719584, 0.013303312472999096, 0.05823688209056854, 0.06085257977247238, 0.0682583823800087, 0.06680850684642792, -0.0008473473135381937, 0.056926507502794266, 0.05309343710541725, 0.017690004780888557, -0.028605103492736816, 0.02303914539515972, -0.07054196298122406, -0.011117611080408096, -0.0012138717574998736, -0.0877937376499176, -0.005339651368558407, -0.029197875410318375, -0.06283852458000183, 0.00677055399864912, 0.07529082894325256, -0.005144342314451933, -0.03930655121803284, -0.0469868965446949, 0.06799482554197311, -0.013870766386389732, -0.07353825122117996, -0.10425472259521484, 8.023920236155391e-05, 0.05196760594844818, -0.024758316576480865, -0.03249195218086243, -0.0037688545417040586, -0.0033505349420011044, 0.04382188990712166, 0.035679250955581665, -0.04743441194295883, 0.031142324209213257, -0.04255860671401024, -0.02310662344098091, -0.04199622571468353, -0.034439221024513245, -0.06397263705730438, -0.011049525812268257, -0.055776823312044144, 0.039233505725860596, 0.016644736751914024, -0.08737850934267044, 0.0151174021884799, 0.10728199779987335, -0.0006503594922833145, -0.060365013778209686, -0.05337308719754219, -0.021152105182409286, 0.06532585620880127, -0.00926337018609047, -0.08149554580450058, 0.0485830195248127, 0.034749776124954224, -0.05045035108923912, -0.06366241723299026, 0.0544571727514267, 0.07594002038240433, 0.027496861293911934, -0.047294747084379196, 0.017491186037659645, -0.034639474004507065, 0.006060798652470112, -0.07335491478443146, -0.054728057235479355, -0.0018357941880822182, -0.07110298424959183, 0.09072742611169815, 0.03083305060863495, 0.054598040878772736, -0.028097454458475113, -0.012821618467569351, 0.008708478882908821, -0.06561881303787231, -0.04448843002319336, 0.08860815316438675, 
-0.050312310457229614, 0.09012935310602188, -0.004711236339062452, -0.020932462066411972, -0.10615857690572739, -0.005630030296742916, 0.03976801037788391, 0.040199730545282364, 0.07235082983970642, -7.448523683706298e-05, 0.076942078769207],
-        "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 0.026804683730006218, 0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 
0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944],
         "SAR":[-0.058561697602272034, -0.014889497309923172, -0.009758144617080688, 0.00019282882567495108, -0.040600407868623734, -0.05907759070396423, 0.033052023500204086, -0.04672614857554436, -0.050173744559288025, -0.06619776040315628, 0.005385559983551502, 0.05449973791837692, -0.0035163976717740297, -0.12835650146007538, 0.06576846539974213, 0.030572880059480667, -0.014856431633234024, 0.011252024210989475, 0.018954169005155563, -0.10070347040891647, -0.032273050397634506, 0.007221086882054806, -0.020879192277789116, 0.0691007450222969, 0.01286559458822012, -0.020694725215435028, -0.07545264810323715, -0.07742343097925186, -0.005103116389364004, 0.10223732143640518, -0.08521754294633865, 0.07459715753793716, 0.006563629489392042, -0.059839747846126556, -0.023294325917959213, 0.04265525937080383, -0.011012998409569263, -0.02257128618657589, -0.033783379942178726, 0.0368407666683197, -0.048024341464042664, -0.037417128682136536, 0.09010431170463562, 0.09016482532024384, -0.07939734309911728, 0.03274676203727722, 0.0388714037835598, -0.03253694251179695, 0.020820122212171555, -0.0039061333518475294, 0.025425976142287254, -0.01847209222614765, 0.013026821427047253, 0.08873090147972107, -0.010358930565416813, -0.026935681700706482, 0.04795868322253227, -0.06173045188188553, -0.02299962192773819, -0.09966729581356049, 0.008027775213122368, 0.03202224150300026, -0.08922284096479416, 0.03263246268033981, 0.0702379047870636, 0.08681228011846542, -0.053993936628103256, 0.0009890834335237741, -0.060423459857702255, 0.08636976033449173, 0.04784319922327995, 0.05135124549269676, -0.023515762761235237, 0.015414481982588768, -0.06941155344247818, 0.004289102740585804, -0.10909571498632431, 0.014149827882647514, -0.025285568088293076, 0.06270574778318405, 0.0669349953532219, 0.03599094599485397, 0.0436582937836647, 0.06281902641057968, -0.04479018226265907, -0.04126136004924774, -0.026938045397400856, -0.0349077507853508, 0.002964549232274294, -0.04247729107737541, 
0.009402072057127953, 0.10574454814195633, 0.03262042999267578, 0.08030910044908524, -0.031244831159710884, 0.010621835477650166, -0.02628093585371971, 0.046942535787820816, -0.022998474538326263, 0.009223603643476963],
         "SBB":[-0.040700677782297134, -0.01474229246377945, 0.09491399675607681, 0.015464535914361477, -0.05408482998609543, -0.09618491679430008, -0.014700816012918949, -0.06255258619785309, 0.09308589994907379, 0.01991264335811138, 0.04899228736758232, -0.03322140499949455, -0.03979090601205826, -0.161369189620018, 0.0957769826054573, -0.045866891741752625, -0.03776619955897331, 0.09559016674757004, -0.0063005415722727776, 0.07086999714374542, -0.004713557660579681, 0.10066409409046173, -0.053719762712717056, 0.07039386034011841, 0.01788068749010563, 0.01069885678589344, -0.003849055850878358, 0.07810717821121216, 0.10748977214097977, -0.09462521225214005, -0.06140149384737015, -0.028434589505195618, 0.0395897701382637, 0.05396975204348564, 0.009982907213270664, -0.014297235757112503, 0.018435295671224594, -0.04264533147215843, -0.0471954308450222, -0.008587008342146873, 0.010918513871729374, -0.03147284686565399, 0.08885594457387924, 0.05178891867399216, 0.05807363614439964, 0.028190992772579193, 0.04205470532178879, 0.00935433991253376, 0.027427801862359047, -0.02180725708603859, -0.06614664196968079, 0.021269382908940315, 0.0585390068590641, 0.12827278673648834, 0.0420454666018486, 0.06753493845462799, -0.05479112267494202, -0.06480395793914795, 0.02621031180024147, -0.07586188614368439, -0.04831313341856003, 0.016674980521202087, -0.006851759273558855, 0.04103298485279083, 0.005965645890682936, -0.02317493036389351, -0.03966135531663895, -0.02576862834393978, -0.0916895642876625, 0.029451601207256317, 0.044677067548036575, 0.026928072795271873, -0.10388721525669098, 0.021140936762094498, -0.06990157812833786, 0.048356350511312485, -0.08890967816114426, -0.0003503488842397928, -0.10245566070079803, -0.0582563653588295, 0.04677841439843178, 0.04697449132800102, -0.04022470489144325, 0.02759086713194847, -0.02867579087615013, 0.013355317525565624, 0.011504339054226875, -0.04230086877942085, -0.045500747859478, -0.03741880878806114, 0.022458063438534737, 
0.05192841589450836, 0.008104681968688965, -0.08284809440374374, 0.059996478259563446, 0.07762005180120468, -0.0031316280364990234, 0.06990513950586319, -0.020328091457486153, -0.0027387691661715508],
         "SETB_C":[-0.007473401725292206, -0.06315194815397263, 0.0693482831120491, 0.05207814276218414, -0.08006429672241211, -0.005448522046208382, -0.007457572966814041, 0.011581258848309517, -0.05411145091056824, -0.06738752871751785, 0.013233165256679058, -0.0677611380815506, 0.01846255734562874, -0.09321920573711395, -0.03116961196064949, 0.05861300230026245, -0.001519175828434527, 0.08354826271533966, 0.023905213922262192, 0.0124649154022336, 0.08983863890171051, 0.055941760540008545, 0.07229111343622208, 0.09052376449108124, -0.013718990609049797, 0.06642850488424301, 0.0822976604104042, 0.010060268454253674, 0.04116540774703026, -0.03301406651735306, 0.07296404242515564, -0.03534134477376938, 0.012426529079675674, -0.005412430968135595, 0.06087784096598625, 0.03547677770256996, -0.007232111878693104, 0.06580550968647003, -0.0037480974569916725, 0.02971699647605419, -0.06937503069639206, 0.08572175353765488, -0.02138090692460537, -0.0053040217608213425, -0.029469722881913185, -0.05332958698272705, 0.10073655843734741, -0.03199373558163643, -0.01775289885699749, -0.09716105461120605, 0.06483447551727295, 0.028643250465393066, -0.029914388433098793, 0.007070464547723532, 0.006640028208494186, -0.0033612342085689306, 0.005682659335434437, 0.011877131648361683, -0.038144148886203766, 0.03381858021020889, 0.02083616890013218, 0.029199717566370964, 0.07813020050525665, -0.006173993926495314, -0.016444502398371696, -0.08474857360124588, 0.03877300024032593, -0.046462398022413254, 0.02460806630551815, 0.053950369358062744, 0.01389766950160265, 0.03323421627283096, 0.04349416866898537, 0.04381947219371796, 0.10320119559764862, -0.1117740124464035, 0.03045269101858139, -0.03870442137122154, -0.07607249915599823, -0.00020808610133826733, -0.09519094228744507, 0.06727365404367447, -0.04469249024987221, 0.07144048810005188, -0.08811240643262863, 0.001203814405016601, 0.06901863217353821, 0.05462682247161865, -0.03902207687497139, -0.05885632708668709, 
-0.028275305405259132, -0.07151838392019272, -0.059166230261325836, -0.015570566058158875, 0.06314826756715775, -0.040293656289577484, 0.021595094352960587, -0.04083842411637306, -0.09180022031068802, -0.0309903621673584],
@@ -263,26 +252,6 @@
         "VINSERTI":[0.09046315401792526, 0.015515458770096302, 0.04200809448957443, -0.046130646020174026, -0.045843131840229034, 0.003743539797142148, 0.025380345061421394, -0.021319231018424034, 0.03850293532013893, 0.006397924851626158, -0.06982530653476715, -0.016159888356924057, -0.09164588898420334, 0.1245846077799797, -0.05104857683181763, -0.011446121148765087, -0.06936608999967575, -0.02683587372303009, -0.0337526798248291, -0.005495472811162472, 0.023584537208080292, -0.0771733894944191, -0.026287894695997238, -0.01172191184014082, 0.09737047553062439, 0.0375351756811142, 0.03280220180749893, -0.014072075486183167, 0.06032971292734146, 0.0072259255684912205, -0.08368974179029465, 0.054626062512397766, -0.021156134083867073, -0.09647785127162933, 0.07431179285049438, -0.09039300680160522, -0.07652204483747482, -0.002478789770975709, -0.012967151589691639, 0.08174770325422287, -0.00968913547694683, 0.015551612712442875, 0.08655177801847458, -0.056927114725112915, -0.011370684020221233, -0.0408773347735405, 0.04413295164704323, 0.05919815972447395, 0.08101782202720642, -0.008914918638765812, -0.019233090803027153, 0.05211508646607399, -0.010292282328009605, 0.021839600056409836, 0.0016950241988524795, -0.031931016594171524, 0.004831018857657909, 0.015328328125178814, -0.015326892025768757, -0.05457184836268425, 0.03782501816749573, -0.014512602239847183, 0.00869232788681984, -0.04001179710030556, -0.00994281005114317, -0.041689563542604446, 0.060574647039175034, 0.044912341982126236, 0.05958174169063568, -0.035378437489271164, 0.08524063974618912, 0.012326095253229141, -0.052227456122636795, -0.015090923756361008, -0.012893415056169033, -0.019565775990486145, -0.03284028172492981, -0.02651887759566307, 0.02436136268079281, 0.004743371158838272, 0.019924448803067207, -0.046163417398929596, 0.005615816451609135, -0.03354670852422714, 0.00801338255405426, 0.02501787059009075, 0.03313247114419937, -0.012842012569308281, 0.04856807366013527, 
-0.031942710280418396, -0.026277944445610046, 0.11483185738325119, 0.015686793252825737, 0.052031729370355606, 0.025188622996211052, -0.021448785439133644, 0.05062439665198326, -0.030834710225462914, 0.02746596559882164, 0.027027780190110207],
         "VINSERTPSrm":[0.010265239514410496, 0.03508870303630829, 0.0182888712733984, 0.01066108699887991, 0.09608251601457596, -0.0390457920730114, -0.02508910745382309, -0.051061324775218964, 0.051924970000982285, 0.02405509166419506, 0.07347054034471512, -0.023432396352291107, 0.03053455613553524, 0.11051331460475922, -0.07987380027770996, -0.07169938087463379, -0.06944284588098526, -0.010227087885141373, 0.01555782649666071, 0.0033066831529140472, -0.017572278156876564, 0.018880389630794525, -0.03347453102469444, 0.023936258628964424, 0.04189354181289673, -0.008910899050533772, 0.045309878885746, 0.039228133857250214, -0.0026367944665253162, 0.01713910512626171, -0.0038225038442760706, -0.02550170198082924, -0.04479162022471428, -0.11607012152671814, -0.05566885322332382, -0.03926549106836319, -0.05618799477815628, 0.0587196871638298, -0.003744689514860511, 0.09148088097572327, -0.008691483177244663, 0.060393813997507095, 0.05017181858420372, 0.05314680561423302, 0.010222898796200752, 0.04390108212828636, 0.06256565451622009, -0.039335936307907104, -0.030927496030926704, -0.010439696721732616, -0.09615844488143921, 0.04857023432850838, -0.021018074825406075, -0.04949686676263809, 0.0718517154455185, -0.0008082279819063842, -0.05119258537888527, -0.016725104302167892, -0.031902290880680084, 0.07473913580179214, -0.07040376961231232, -0.06263279169797897, -0.01866966485977173, -0.04580819234251976, -0.0018242622027173638, 0.02124813199043274, 0.01608111523091793, 0.033293697983026505, 0.04724595695734024, 0.06764823198318481, -0.010222701355814934, -0.09166357666254044, 0.0065320758149027824, 0.03907076641917229, 0.014404546469449997, -0.04371245950460434, -0.036747194826602936, -0.013570152223110199, -0.04874530807137489, -0.048001520335674286, 0.015114152804017067, 0.018710903823375702, 0.06920618563890457, -0.04024452716112137, 0.07851467281579971, -0.03879975154995918, 0.039278119802474976, -0.06678346544504166, -0.02139596827328205, 0.03483150899410248, 
0.047795433551073074, -0.08071790635585785, 0.05023014172911644, -0.05863310769200325, 0.04391729459166527, -0.006796867586672306, -0.07156652212142944, -0.016402525827288628, 0.07568787783384323, 0.01640038751065731],
         "VINSERTPSrr":[0.022902479395270348, 0.024018414318561554, 0.011011038906872272, 0.009072761051356792, -0.057881347835063934, 0.03739643841981888, 0.006768766324967146, 0.07734010368585587, 0.001509909750893712, 0.043335434049367905, -0.07244917005300522, -0.1078081876039505, 0.027186766266822815, 0.018834171816706657, 0.007436624728143215, -0.048498328775167465, 0.09450934827327728, -0.015420452691614628, 0.014672537334263325, -0.0012827727477997541, 0.019664635881781578, -0.026565955951809883, -0.04819858446717262, -0.0004387270309962332, 0.01676507107913494, -0.0014571163337677717, 0.015105879865586758, 0.04038102179765701, 0.008408628404140472, 0.07757255434989929, -0.09923559427261353, -0.04181523248553276, -0.0313955582678318, -0.006045420188456774, 0.05904707312583923, -0.014993838034570217, 0.03219055384397507, 0.058543696999549866, -0.06872320920228958, 0.021718619391322136, -0.08984571695327759, 0.06557019799947739, -0.018167613074183464, -0.011413732543587685, 0.036035921424627304, 0.10104569792747498, 0.05836406350135803, -0.02576756477355957, 0.03827998414635658, -0.06874323636293411, 0.01668366976082325, 0.048310816287994385, -0.010213772766292095, -0.035550933331251144, -0.03385040909051895, 0.004614332225173712, 0.018951643258333206, 0.10679180175065994, -0.019135646522045135, -0.011955377645790577, 0.028140606358647346, -0.08185642957687378, -0.015775075182318687, -0.011507326737046242, 0.07914295047521591, -0.030148068442940712, 0.11757981777191162, 0.00040086795343086123, 0.056880321353673935, -0.014461426064372063, -0.0008378245402127504, -0.06245473772287369, -0.05332277715206146, -0.038401950150728226, 0.005011103581637144, 0.0368003323674202, -0.021230563521385193, 0.01497745979577303, 0.0372738242149353, 0.07988940924406052, -0.013381360098719597, 0.0036820468958467245, 0.07501927763223648, -0.0996084213256836, 0.028014982119202614, -0.09410325437784195, 0.0007335525006055832, 0.004884959198534489, 0.040397197008132935, 
-0.07655651122331619, 0.05677357316017151, 0.005359896458685398, -0.047478366643190384, 0.03828851506114006, 0.03363208472728729, -0.041756175458431244, -0.00031817640410736203, 0.022837577387690544, 0.039567966014146805, 0.03662540763616562],
-        "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, 
-0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894],
-        "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, 
-0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718],
-        "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, -0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, 
-0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, -0.022238614037632942, -0.049835607409477234],
-        "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 
0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654],
-        "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, -0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 
0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045],
-        "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, -0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 
0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849],
-        "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 
0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, -0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742],
-        "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 
0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092],
-        "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 
0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568],
-        "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, 
-0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116],
-        "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, -0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 
0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 0.006323353853076696, 0.016448041424155235, -0.01961681991815567],
-        "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, -0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 
0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707],
-        "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, 
-0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864],
-        "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 
0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849],
-        "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, -0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 
0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, -0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715],
-        "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, -0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 
0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767],
-        "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, 
-0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935],
-        "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, 
-0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412],
-        "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 
0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197],
-        "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, -0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 
0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335],
         "VMASKMOVPDYmr":[-0.04878474771976471, 0.009688055142760277, 0.05428608879446983, -0.030850162729620934, 0.03008297272026539, 0.03831377625465393, -0.023454757407307625, 0.061062078922986984, -0.07177434861660004, 0.003681673901155591, 0.040161218494176865, -0.009652352891862392, 0.07261710613965988, -0.010966332629323006, -0.013221205212175846, -0.03301544487476349, 0.04829031974077225, -0.08083753287792206, 0.030231673270463943, -0.02659734897315502, -0.036777157336473465, 0.06681652367115021, 0.01175805926322937, 0.06305940449237823, -0.019296150654554367, 0.02796877548098564, -0.029999401420354843, -0.0198240764439106, -0.04471949115395546, -0.06781838089227676, 0.024380704388022423, 0.03754236921668053, 0.06767786294221878, 0.04803696274757385, 0.046649131923913956, 0.04538867995142937, -0.028629129752516747, 0.0127564687281847, 0.004995361436158419, -0.08728974312543869, 0.029057662934064865, 0.07067801058292389, 0.0007887053652666509, 0.019237162545323372, -0.04447153955698013, -0.10583364218473434, 0.08983936905860901, 0.015038984827697277, -0.034384895116090775, -0.055098336189985275, -0.07670909911394119, 0.002524072304368019, 0.10086455941200256, 0.022610867395997047, 0.05591642111539841, -0.07907918840646744, -0.04253252223134041, 0.05387851223349571, -0.034182146191596985, -0.08478306978940964, -0.039358172565698624, 0.05872701108455658, 0.0004980096709914505, -0.054017916321754456, -0.05543661117553711, -0.05234605073928833, -0.01648441143333912, -0.039598412811756134, 0.014009279198944569, 0.07753992825746536, -0.024791967123746872, 0.0015941763995215297, -0.08564147353172302, 0.015439499169588089, 0.04659571126103401, 0.042471837252378464, 0.005456998012959957, 0.015990061685442924, -0.02272135764360428, -0.03891618177294731, -0.0077924951910972595, -0.05113787576556206, 0.040118955075740814, -0.043831776827573776, 0.05283576622605324, 0.09104584157466888, 0.015506122261285782, -0.028880758211016655, -0.0025508899707347155, 
0.08238258212804794, -0.011219828389585018, 0.0496247261762619, -0.044287387281656265, 0.050674524158239365, 0.02936738170683384, -0.017218898981809616, 0.07722929865121841, 0.04578819498419762, -0.031120644882321358, -0.022032534703612328],
         "VMASKMOVPSYmr":[0.020578626543283463, -0.004085692577064037, 0.07696651667356491, 0.028803450986742973, -0.006955036427825689, -0.018540993332862854, 0.0719260424375534, 0.09322775900363922, 0.05095001682639122, -0.01811334490776062, 0.01627892442047596, 0.050088733434677124, -0.06736274808645248, 0.025077303871512413, 0.06022811681032181, -0.09305489808320999, -0.09338469058275223, -0.0525103323161602, -0.06159364432096481, 0.030921749770641327, 0.06632588058710098, 0.031169326975941658, 0.016549210995435715, -0.06410345435142517, 0.034944821149110794, 0.01632581278681755, 0.06805131584405899, -0.004622941836714745, -0.02994105964899063, 0.025459013879299164, 0.020487098023295403, 0.06677251309156418, -0.046148937195539474, -0.05847230181097984, -0.0662517175078392, -0.006552667822688818, 0.05338975414633751, -0.07456435263156891, -0.05682503432035446, -0.0720917284488678, -0.08354304730892181, 0.057539310306310654, -0.0984572172164917, -0.015717046335339546, -0.04905203357338905, -0.016580646857619286, 0.030063051730394363, -0.04245767742395401, -0.019089849665760994, 0.037014883011579514, -0.03125334531068802, -0.02194075658917427, 0.057924628257751465, 0.053156934678554535, 0.03154401481151581, -0.03698640316724777, -0.047283731400966644, -0.07787752151489258, -0.09294760227203369, 0.008879968896508217, -0.039479922503232956, 0.06407082825899124, 0.021868228912353516, 0.02621234394609928, -0.05872492864727974, -0.07943505048751831, 0.024682780727744102, 0.014713538810610771, 0.02206231839954853, -0.0664556622505188, -0.08985312283039093, -0.028045928105711937, 0.022865260019898415, -0.03564520925283432, 0.06292934715747833, 0.009946631267666817, 0.031550049781799316, -0.08577742427587509, 0.047102898359298706, -0.07018786668777466, -0.10670997202396393, 0.0016501775244250894, -0.08505392074584961, 0.00861909706145525, -0.06370823830366135, 0.03423767164349556, 0.03173772618174553, 0.019602738320827484, 0.021573755890130997, 0.02385428547859192, 
-0.01468846295028925, 0.023825718089938164, -0.05538937821984291, 0.05968264490365982, 0.08997872471809387, -0.006320557557046413, 0.012793052941560745, -0.10326020419597626, -0.015349009074270725, 0.006139614153653383],
         "VMASKMOVPSYrm":[-0.09999474138021469, -0.05461611971259117, 0.06111544370651245, 0.009340068325400352, 0.05158305540680885, 0.018409717828035355, 0.03258055821061134, -0.0017857305938377976, 0.041260261088609695, -0.04183795303106308, -0.04711655154824257, 0.007005605846643448, 0.017177876085042953, 0.011972760781645775, -0.058734532445669174, 0.022736912593245506, -0.10794606059789658, 0.029367392882704735, -0.012645614333450794, -0.09590506553649902, -0.07090207934379578, -0.05850019305944443, -0.018024247139692307, -0.0036007456947118044, -0.06459654122591019, 0.009839186444878578, 0.04846305027604103, -0.11106285452842712, 0.029033005237579346, 0.10009876638650894, 0.012796668335795403, -0.0073439814150333405, -0.08754748106002808, -0.037603843957185745, -0.015900349244475365, -0.007158457301557064, 0.03420218825340271, 0.027995899319648743, -0.07699259370565414, 0.042778756469488144, 0.04648644104599953, -0.04391217231750488, -0.018405593931674957, -0.01280362717807293, 0.08530068397521973, -0.03674551844596863, 0.06248623505234718, 0.0038591010961681604, -0.07031620293855667, -0.01702764257788658, 0.005379523150622845, -0.029414091259241104, 0.00011999297566944733, 0.058016858994960785, 0.10091454535722733, 0.07112561911344528, -0.07445680350065231, -0.08252609521150589, -0.05458306148648262, 0.0828995481133461, 0.030287114903330803, 0.08512170612812042, -0.0745752677321434, 0.011145705357193947, 0.07730960845947266, 0.06756677478551865, 0.10192125290632248, -0.015338120050728321, 0.025173967704176903, -0.017697714269161224, 0.00455897580832243, -0.01002852339297533, -0.09001599997282028, 0.06024448946118355, 0.01357717253267765, -0.04349803552031517, 0.026919689029455185, 0.07871785014867783, 0.06163106486201286, -0.02904645912349224, 0.05042176693677902, 0.019180594012141228, -0.029065869748592377, -0.02645217627286911, -0.04180121049284935, -0.01644887775182724, 0.005278781522065401, 0.021325504407286644, 0.0710480809211731, -0.02405066229403019, 
0.06883849203586578, -0.08493685722351074, -0.0180019810795784, 0.10276532918214798, -0.04697193205356598, -0.0004998040967620909, 0.014400942251086235, 0.07172509282827377, 0.027445673942565918, 0.04722077399492264],
@@ -673,5 +642,42 @@
         "V_SETALLONES":[0.011805560439825058, 0.005605545360594988, 0.019577916711568832, -0.007038246374577284, -0.013101942837238312, -0.060087915509939194, 0.06600171327590942, 0.1127510741353035, 0.03251935541629791, -0.08513955771923065, -0.1272188425064087, -0.05743984133005142, 0.03415455296635628, -0.01813715696334839, 0.08123213797807693, -0.02604430541396141, 0.004977638833224773, -0.05056260898709297, 0.0759609192609787, -0.03905864432454109, -0.029284782707691193, -0.0773778036236763, -0.06391929090023041, 0.03013690747320652, 0.025567403063178062, -0.04096659645438194, -0.013911372050642967, 0.03076753579080105, 0.09287972748279572, 0.06516721844673157, 0.013303481042385101, -0.05148301273584366, 0.013247961178421974, -0.02087739109992981, -0.06532798707485199, -0.07080436497926712, 0.03797996789216995, -0.05954182893037796, -0.006158157251775265, -0.039611611515283585, 0.016250262036919594, -0.009441757574677467, -0.009183786809444427, 0.16159473359584808, 0.08712765574455261, -0.022884182631969452, -0.03575573116540909, -0.03199240192770958, -0.03306444734334946, -0.003918874077498913, 0.062194518744945526, 0.015179269947111607, -0.027334710583090782, -0.058873455971479416, 0.128275528550148, -0.0292880367487669, -0.07747887820005417, 0.1131230816245079, 0.02434738725423813, -0.025987306609749794, 0.006977062206715345, 0.005061171483248472, 0.010551988147199154, -0.011694980785250664, -0.04222672060132027, 0.0018857514951378107, -0.09771532565355301, 0.005980918649584055, -0.021874738857150078, -0.03269551321864128, -0.0660959854722023, -0.03511122986674309, -0.012204808183014393, -0.010394910350441933, 0.05620425567030907, -0.07928325980901718, 0.0231300238519907, -0.018796175718307495, -0.059483520686626434, -0.06498315185308456, -0.002720780670642853, 0.017449399456381798, -0.07902888208627701, -0.09885134547948837, 0.013462111353874207, 0.0991656631231308, 0.03312922269105911, -0.006249894388020039, 0.005173753947019577, -0.06332565099000931, 
-0.06398826092481613, -0.03855561092495918, 0.049685269594192505, 0.016197331249713898, -0.006844596937298775, -0.05894636735320091, 0.026065604761242867, -0.023921040818095207, 0.0833858922123909, 0.04180749133229256],
         "XCHG":[0.03013892099261284, -0.005918541457504034, -0.003877029987052083, -0.01153622567653656, 0.07044235616922379, 0.0020885420963168144, -0.04268760234117508, 0.07963797450065613, 0.0896378755569458, -0.03346250206232071, -0.026062551885843277, 0.07721738517284393, 0.08893758058547974, 0.0798523873090744, -0.025333784520626068, -0.01930663175880909, -0.012997916899621487, -0.051225848495960236, -0.0299966000020504, -0.032841041684150696, -0.06343690305948257, -0.016547048464417458, 0.034530773758888245, 0.057199425995349884, 0.0693645030260086, 0.04208416864275932, -0.028830133378505707, 0.08431533724069595, -0.06464798003435135, 0.0009512414690107107, 0.042868468910455704, -0.031348757445812225, -0.01816270686686039, 0.05597987025976181, -0.017707090824842453, -0.03889893740415573, -0.052769940346479416, 0.012921033427119255, -0.029488561674952507, -0.012502696365118027, 0.05398940294981003, -0.032147347927093506, -0.005250571761280298, -0.014250441454350948, 0.08205590397119522, 0.049281857907772064, -0.07257362455129623, -0.0003973407146986574, -0.00821124017238617, 0.10007432103157043, 0.054469816386699677, -0.05644146353006363, 0.013105852529406548, -0.08262810856103897, -0.02594495750963688, 0.007682343479245901, -0.011262120679020882, -0.007376475725322962, -0.05011703073978424, -0.06952987611293793, -0.033738043159246445, 0.01750120520591736, -0.026767224073410034, -0.04718783125281334, 0.002559647196903825, 0.01700885407626629, -0.07193762063980103, 0.07015261799097061, 0.0034866048954427242, -0.08257746696472168, -0.07703307271003723, 0.006709580775350332, 0.06423933804035187, 0.024792056530714035, -0.008637255057692528, 0.0364011712372303, 0.035330090671777725, -0.060980167239904404, 0.026977067813277245, -0.02813805267214775, -0.02690977416932583, 0.05637027323246002, 0.008040377870202065, -0.03371180593967438, -0.06654872000217438, -0.030922764912247658, -0.07050447911024094, 0.047597192227840424, 0.047301240265369415, 0.04565070942044258, 
-0.0005885852151550353, -0.01970672234892845, -0.013277091085910797, 0.03462797775864601, -0.050644565373659134, -6.830461643403396e-05, -0.0032834408339112997, -0.09096988290548325, -0.0431605726480484, 0.004180085379630327],
         "XOR":[0.05397406592965126, 0.030059566721320152, -0.008174624294042587, -0.015902524814009666, -0.05867229402065277, 0.10023067146539688, 0.039013586938381195, -0.0062194764614105225, 0.0027951474767178297, -0.12871405482292175, 0.006182669661939144, -0.03362947702407837, 0.03972288593649864, -0.0761077031493187, 0.07198456674814224, 0.06330277770757675, -0.020690103992819786, 0.04084693267941475, -0.029953323304653168, -0.1037738174200058, 0.058683767914772034, -0.09326515346765518, -0.030509043484926224, 0.08620086312294006, -0.028335779905319214, 0.0025649559684097767, 0.02293877862393856, 0.06309233605861664, 0.05537085980176926, 0.008650199510157108, 0.08450134843587875, 0.006163342390209436, 0.08676894754171371, 0.00373055599629879, -0.0536164715886116, 0.017478466033935547, -0.02005663886666298, -0.009954672306776047, 0.0935724526643753, -0.013202485628426075, 0.019175032153725624, 0.047811202704906464, -0.010279017500579357, 0.08613553643226624, 0.030951783061027527, -0.007498149760067463, 0.02222890406847, 0.022576699033379555, -0.037464242428541183, -0.05039561539888382, -0.05145428702235222, 0.05291113257408142, -0.04549814388155937, 0.07552238553762436, 0.04320567473769188, 0.08343681693077087, -0.03850278630852699, -0.01834949105978012, 0.047886237502098083, 0.00965320598334074, 0.014898041263222694, -0.06947735697031021, -0.002480468712747097, 0.033667247742414474, -0.057668499648571014, 0.038462892174720764, -0.04644528403878212, -0.06664751470088959, -0.048734813928604126, 0.04303475841879845, 0.027636554092168808, 0.024116700515151024, -0.003788548056036234, -0.0088395019993186, -0.04236738011240959, -0.02894027903676033, -0.135579451918602, -0.032144784927368164, -0.11316774785518646, -0.0039872839115560055, 0.07162772864103317, 0.03945969045162201, 0.007661669049412012, 0.04564569517970085, 0.023007070645689964, 0.0002026051515713334, -0.030437719076871872, -0.01982058770954609, -0.017619898542761803, -0.04013601690530777, 
0.03464880958199501, -0.04437020793557167, 0.010373799130320549, -0.057255037128925323, -0.006371108815073967, -0.02713695913553238, -0.06605585664510727, 0.01780680939555168, -0.00013575045159086585, 0.07283638417720795]
+    },
+    "CommonOperands" : {   
+        "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, -0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 
0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, -0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983],
+        "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, 
-0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257],
+        "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, 
-0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805],
+        "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 
0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232],
+        "JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, -0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 
0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772],
+        "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, -0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 
0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, -0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833],
+        "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, -0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 
0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893],
+        "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 0.026804683730006218, 0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 
0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944],
+        "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 
0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 0.06334574520587921, 0.056647684425115585, -0.02965259924530983],
+        "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, 
-0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583]
+    },
+    "VirtualRegisters" : {
+        "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, 
-0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894],
+        "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, 
-0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718],
+        "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, -0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, 
-0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, -0.022238614037632942, -0.049835607409477234],
+        "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 
0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654],
+        "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, -0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 
0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045],
+        "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, -0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 
0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849],
+        "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 
0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, -0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742],
+        "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 
0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092],
+        "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 
0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568],
+        "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, 
-0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116],
+        "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, -0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 
0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 0.006323353853076696, 0.016448041424155235, -0.01961681991815567],
+        "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, -0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 
0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707],
+        "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, 
-0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864],
+        "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 
0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849],
+        "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, -0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 
0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, -0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715],
+        "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, -0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 
0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767],
+        "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, 
-0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935],
+        "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, 
-0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412],
+        "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 
0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197],
+        "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, -0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 
0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335]
+    },
+    "PhysicalRegisters" : {
+        "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, 
-0.001252106623724103, 0.07216084003448486, -0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063]
     }
 }
 \ No newline at end of file
diff --git a/llvm/lib/AsmParser/AsmParserContext.cpp b/llvm/lib/AsmParser/AsmParserContext.cpp
new file mode 100644
index 0000000..59d3ffc
--- /dev/null
+++ b/llvm/lib/AsmParser/AsmParserContext.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/AsmParser/AsmParserContext.h"
+
+namespace llvm {
+
+std::optional<FileLocRange>
+AsmParserContext::getFunctionLocation(const Function *F) const {
+  // No entry in the table means there is no range to report for F.
+  auto Pos = Functions.find(F);
+  return Pos == Functions.end() ? std::nullopt : std::optional(Pos->second);
+}
+
+std::optional<FileLocRange>
+AsmParserContext::getBlockLocation(const BasicBlock *BB) const {
+  // No entry in the table means there is no range to report for BB.
+  auto Pos = Blocks.find(BB);
+  return Pos == Blocks.end() ? std::nullopt : std::optional(Pos->second);
+}
+
+std::optional<FileLocRange>
+AsmParserContext::getInstructionLocation(const Instruction *I) const {
+  // No entry in the table means there is no range to report for I.
+  auto Pos = Instructions.find(I);
+  return Pos == Instructions.end() ? std::nullopt : std::optional(Pos->second);
+}
+
+Function *
+AsmParserContext::getFunctionAtLocation(const FileLocRange &Query) const {
+  // Scan every entry; first function whose range contains Query, else null.
+  for (auto &[Fn, Span] : Functions)
+    if (Span.contains(Query))
+      return Fn;
+  return nullptr;
+}
+
+Function *AsmParserContext::getFunctionAtLocation(const FileLoc &Query) const {
+  return getFunctionAtLocation(FileLocRange(Query, Query)); // single point
+}
+
+BasicBlock *
+AsmParserContext::getBlockAtLocation(const FileLocRange &Query) const {
+  // Scan every entry; first block whose range contains Query, else null.
+  for (auto &[Block, Span] : Blocks)
+    if (Span.contains(Query))
+      return Block;
+  return nullptr;
+}
+
+BasicBlock *AsmParserContext::getBlockAtLocation(const FileLoc &Query) const {
+  return getBlockAtLocation(FileLocRange(Query, Query)); // single point
+}
+
+Instruction *
+AsmParserContext::getInstructionAtLocation(const FileLocRange &Query) const {
+  // Scan every entry; first instruction whose range contains Query, else null.
+  for (auto &[Inst, Span] : Instructions)
+    if (Span.contains(Query))
+      return Inst;
+  return nullptr;
+}
+
+Instruction *
+AsmParserContext::getInstructionAtLocation(const FileLoc &Query) const {
+  return getInstructionAtLocation(FileLocRange(Query, Query)); // single point
+}
+
+bool AsmParserContext::addFunctionLocation(Function *F,
+                                           const FileLocRange &Loc) {
+  return Functions.insert({F, Loc}).second; // true iff newly inserted
+}
+
+bool AsmParserContext::addBlockLocation(BasicBlock *BB,
+                                        const FileLocRange &Loc) {
+  return Blocks.insert({BB, Loc}).second; // true iff newly inserted
+}
+
+bool AsmParserContext::addInstructionLocation(Instruction *I,
+                                              const FileLocRange &Loc) {
+  return Instructions.insert({I, Loc}).second; // true iff newly inserted
+}
+
+} // namespace llvm
diff --git a/llvm/lib/AsmParser/CMakeLists.txt b/llvm/lib/AsmParser/CMakeLists.txt
index 20d0c50..dcfcc06 100644
--- a/llvm/lib/AsmParser/CMakeLists.txt
+++ b/llvm/lib/AsmParser/CMakeLists.txt
@@ -1,5 +1,6 @@
 # AsmParser
 add_llvm_component_library(LLVMAsmParser
+  AsmParserContext.cpp
   LLLexer.cpp
   LLParser.cpp
   Parser.cpp
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 50d1d47..7a6c19e 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -191,6 +191,8 @@ int LLLexer::getNextChar() {
 }
 
 lltok::Kind LLLexer::LexToken() {
+  // Set token end to next location, since the end is exclusive.
+  PrevTokEnd = CurPtr;
   while (true) {
     TokStart = CurPtr;
 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index cf63285..5164cec 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -451,6 +451,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   UpgradeModuleFlags(*M);
   UpgradeNVVMAnnotations(*M);
   UpgradeSectionAttributes(*M);
+  copyModuleAttrToFunctions(*M);
 
   if (!Slots)
     return false;
@@ -751,14 +752,21 @@ bool LLParser::parseDeclare() {
 ///   ::= 'define' FunctionHeader (!dbg !56)* '{' ...
 bool LLParser::parseDefine() {
   assert(Lex.getKind() == lltok::kw_define);
+  FileLoc FunctionStart(Lex.getTokLineColumnPos());
   Lex.Lex();
 
   Function *F;
   unsigned FunctionNumber = -1;
   SmallVector<unsigned> UnnamedArgNums;
-  return parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) ||
-         parseOptionalFunctionMetadata(*F) ||
-         parseFunctionBody(*F, FunctionNumber, UnnamedArgNums);
+  bool RetValue =
+      parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) ||
+      parseOptionalFunctionMetadata(*F) ||
+      parseFunctionBody(*F, FunctionNumber, UnnamedArgNums);
+  // On a parse error F may never have been assigned; record only on success.
+  if (!RetValue && ParserContext)
+    ParserContext->addFunctionLocation(
+        F, FileLocRange(FunctionStart, Lex.getPrevTokEndLineColumnPos()));
+  return RetValue;
 }
 
 /// parseGlobalType
@@ -7017,6 +7025,8 @@ bool LLParser::parseFunctionBody(Function &Fn, unsigned FunctionNumber,
 /// parseBasicBlock
 ///   ::= (LabelStr|LabelID)? Instruction*
 bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
+  FileLoc BBStart(Lex.getTokLineColumnPos());
+
   // If this basic block starts out with a name, remember it.
   std::string Name;
   int NameID = -1;
@@ -7058,6 +7068,7 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
       TrailingDbgRecord.emplace_back(DR, DeleteDbgRecord);
     }
 
+    FileLoc InstStart(Lex.getTokLineColumnPos());
     // This instruction may have three possibilities for a name: a) none
     // specified, b) name specified "%foo =", c) number specified: "%4 =".
     LocTy NameLoc = Lex.getLoc();
@@ -7107,8 +7118,16 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
     for (DbgRecordPtr &DR : TrailingDbgRecord)
       BB->insertDbgRecordBefore(DR.release(), Inst->getIterator());
     TrailingDbgRecord.clear();
+    if (ParserContext) {
+      ParserContext->addInstructionLocation(
+          Inst, FileLocRange(InstStart, Lex.getPrevTokEndLineColumnPos()));
+    }
   } while (!Inst->isTerminator());
 
+  if (ParserContext)
+    ParserContext->addBlockLocation(
+        BB, FileLocRange(BBStart, Lex.getPrevTokEndLineColumnPos()));
+
   assert(TrailingDbgRecord.empty() &&
          "All debug values should have been attached to an instruction.");
 
diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index 07fdce9..c5346d0 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -24,33 +24,38 @@ using namespace llvm;
 static bool parseAssemblyInto(MemoryBufferRef F, Module *M,
                               ModuleSummaryIndex *Index, SMDiagnostic &Err,
                               SlotMapping *Slots, bool UpgradeDebugInfo,
-                              DataLayoutCallbackTy DataLayoutCallback) {
+                              DataLayoutCallbackTy DataLayoutCallback,
+                              AsmParserContext *ParserContext = nullptr) {
   SourceMgr SM;
   std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
   SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
 
   std::optional<LLVMContext> OptContext;
   return LLParser(F.getBuffer(), SM, Err, M, Index,
-                  M ? M->getContext() : OptContext.emplace(), Slots)
+                  M ? M->getContext() : OptContext.emplace(), Slots,
+                  ParserContext)
       .Run(UpgradeDebugInfo, DataLayoutCallback);
 }
 
 bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M,
                              ModuleSummaryIndex *Index, SMDiagnostic &Err,
                              SlotMapping *Slots,
-                             DataLayoutCallbackTy DataLayoutCallback) {
+                             DataLayoutCallbackTy DataLayoutCallback,
+                             AsmParserContext *ParserContext) {
   return ::parseAssemblyInto(F, M, Index, Err, Slots,
-                             /*UpgradeDebugInfo*/ true, DataLayoutCallback);
+                             /*UpgradeDebugInfo*/ true, DataLayoutCallback,
+                             ParserContext);
 }
 
 std::unique_ptr<Module>
 llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
-                    SlotMapping *Slots,
-                    DataLayoutCallbackTy DataLayoutCallback) {
+                    SlotMapping *Slots, DataLayoutCallbackTy DataLayoutCallback,
+                    AsmParserContext *ParserContext) {
   std::unique_ptr<Module> M =
       std::make_unique<Module>(F.getBufferIdentifier(), Context);
 
-  if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback))
+  if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback,
+                        ParserContext))
     return nullptr;
 
   return M;
@@ -133,12 +138,14 @@ ParsedModuleAndIndex llvm::parseAssemblyFileWithIndexNoUpgradeDebugInfo(
                                       DataLayoutCallback);
 }
 
-std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString,
-                                                  SMDiagnostic &Err,
-                                                  LLVMContext &Context,
-                                                  SlotMapping *Slots) {
+std::unique_ptr<Module>
+llvm::parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
+                          LLVMContext &Context, SlotMapping *Slots,
+                          AsmParserContext *ParserContext) {
   MemoryBufferRef F(AsmString, "<string>");
-  return parseAssembly(F, Err, Context, Slots);
+  return parseAssembly(
+      F, Err, Context, Slots, [](StringRef, StringRef) { return std::nullopt; },
+      ParserContext);
 }
 
 static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F,
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index aaee1f0..466dcb0 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7143,6 +7143,8 @@ Error BitcodeReader::materializeModule() {
 
   UpgradeARCRuntime(*TheModule);
 
+  copyModuleAttrToFunctions(*TheModule);
+
   return Error::success();
 }
 
@@ -8601,7 +8603,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream,
     case bitc::FS_FLAGS: { // [flags]
       uint64_t Flags = Record[0];
       // Scan flags.
-      assert(Flags <= 0x2ff && "Unexpected bits in flag");
+      assert(Flags <= 0x7ff && "Unexpected bits in flag");
 
       bool EnableSplitLTOUnit = Flags & 0x8;
       bool UnifiedLTO = Flags & 0x200;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 9288d7e..9c0b68b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -334,7 +334,7 @@ public:
                                const DIE &TyDIE);
 
 protected:
-  ~DwarfUnit();
+  ~DwarfUnit() override;
 
   /// Create new static data member DIE.
   DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 0522698..c1365f4 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -64,7 +64,6 @@ CGOPT_EXP(uint64_t, LargeDataThreshold)
 CGOPT(ExceptionHandling, ExceptionModel)
 CGOPT_EXP(CodeGenFileType, FileType)
 CGOPT(FramePointerKind, FramePointerUsage)
-CGOPT(bool, EnableUnsafeFPMath)
 CGOPT(bool, EnableNoInfsFPMath)
 CGOPT(bool, EnableNoNaNsFPMath)
 CGOPT(bool, EnableNoSignedZerosFPMath)
@@ -219,12 +218,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
                      "Enable frame pointer elimination")));
   CGBINDOPT(FramePointerUsage);
 
-  static cl::opt<bool> EnableUnsafeFPMath(
-      "enable-unsafe-fp-math",
-      cl::desc("Enable optimizations that may decrease FP precision"),
-      cl::init(false));
-  CGBINDOPT(EnableUnsafeFPMath);
-
   static cl::opt<bool> EnableNoInfsFPMath(
       "enable-no-infs-fp-math",
       cl::desc("Enable FP math optimizations that assume no +-Infs"),
@@ -552,7 +545,6 @@ TargetOptions
 codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
   TargetOptions Options;
   Options.AllowFPOpFusion = getFuseFPOps();
-  Options.UnsafeFPMath = getEnableUnsafeFPMath();
   Options.NoInfsFPMath = getEnableNoInfsFPMath();
   Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
   Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
@@ -706,7 +698,6 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
   if (getStackRealign())
     NewAttrs.addAttribute("stackrealign");
 
-  HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math");
   HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
   HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
   HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index 2cba6f0..0665437 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -62,7 +62,7 @@ public:
   static std::unique_ptr<WorkListMaintainer>
   create(Level Lvl, WorkListTy &WorkList, MachineRegisterInfo &MRI);
 
-  virtual ~WorkListMaintainer() = default;
+  ~WorkListMaintainer() override = default;
 
   void reportFullyCreatedInstrs() {
     LLVM_DEBUG({
@@ -95,7 +95,7 @@ public:
   WorkListMaintainerImpl(WorkListTy &WorkList, MachineRegisterInfo &MRI)
       : WorkList(WorkList), MRI(MRI) {}
 
-  virtual ~WorkListMaintainerImpl() = default;
+  ~WorkListMaintainerImpl() override = default;
 
   void reset() override {
     DeferList.clear();
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 884c3f1..1fe38d6 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -139,7 +139,7 @@ class DILocationVerifier : public GISelChangeObserver {
 
 public:
   DILocationVerifier() = default;
-  ~DILocationVerifier() = default;
+  ~DILocationVerifier() override = default;
 
   const Instruction *getCurrentInst() const { return CurrInst; }
   void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; }
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
index b655375..94e3a82 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -69,7 +69,7 @@ public:
   static char ID;
 
   LiveDebugValuesLegacy();
-  ~LiveDebugValuesLegacy() = default;
+  ~LiveDebugValuesLegacy() override = default;
 
   /// Calculate the liveness information for the given machine function.
   bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index b9ea03f..1c4b2f9 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -1094,7 +1094,7 @@ public:
   /// Default construct and initialize the pass.
   VarLocBasedLDV();
 
-  ~VarLocBasedLDV();
+  ~VarLocBasedLDV() override;
 
   /// Print to ostream with a message.
   void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index 99be1fc0..75ca06a 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -42,6 +42,13 @@ static cl::opt<std::string>
 cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
                          cl::desc("Weight for machine opcode embeddings"),
                          cl::cat(MIR2VecCategory));
+cl::opt<float> CommonOperandWeight(
+    "mir2vec-common-operand-weight", cl::Optional, cl::init(1.0),
+    cl::desc("Weight for common operand embeddings"), cl::cat(MIR2VecCategory));
+cl::opt<float>
+    RegOperandWeight("mir2vec-reg-operand-weight", cl::Optional, cl::init(1.0),
+                     cl::desc("Weight for register operand embeddings"),
+                     cl::cat(MIR2VecCategory));
 cl::opt<MIR2VecKind> MIR2VecEmbeddingKind(
     "mir2vec-kind", cl::Optional,
     cl::values(clEnumValN(MIR2VecKind::Symbolic, "symbolic",
@@ -56,26 +63,52 @@ cl::opt<MIR2VecKind> MIR2VecEmbeddingKind(
 // Vocabulary
 //===----------------------------------------------------------------------===//
 
-MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries,
-                             const TargetInstrInfo &TII)
-    : TII(TII) {
+MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap,
+                             VocabMap &&PhysicalRegisterMap,
+                             VocabMap &&VirtualRegisterMap,
+                             const TargetInstrInfo &TII,
+                             const TargetRegisterInfo &TRI,
+                             const MachineRegisterInfo &MRI)
+    : TII(TII), TRI(TRI), MRI(MRI) {
   buildCanonicalOpcodeMapping();
-
   unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size();
   assert(CanonicalOpcodeCount > 0 &&
          "No canonical opcodes found for target - invalid vocabulary");
-  Layout.OperandBase = CanonicalOpcodeCount;
-  generateStorage(OpcodeEntries);
+
+  buildRegisterOperandMapping();
+
+  // Define layout of vocabulary sections
+  Layout.OpcodeBase = 0;
+  Layout.CommonOperandBase = CanonicalOpcodeCount;
+  // We expect same classes for physical and virtual registers
+  Layout.PhyRegBase = Layout.CommonOperandBase + std::size(CommonOperandNames);
+  Layout.VirtRegBase = Layout.PhyRegBase + RegisterOperandNames.size();
+
+  generateStorage(OpcodeMap, CommonOperandMap, PhysicalRegisterMap,
+                  VirtualRegisterMap);
   Layout.TotalEntries = Storage.size();
 }
 
-Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries,
-                                              const TargetInstrInfo &TII) {
-  if (Entries.empty())
+Expected<MIRVocabulary>
+MIRVocabulary::create(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap,
+                      VocabMap &&PhyRegMap, VocabMap &&VirtRegMap,
+                      const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+                      const MachineRegisterInfo &MRI) {
+  if (OpcodeMap.empty() || CommonOperandMap.empty() || PhyRegMap.empty() ||
+      VirtRegMap.empty())
     return createStringError(errc::invalid_argument,
                              "Empty vocabulary entries provided");
 
-  return MIRVocabulary(std::move(Entries), TII);
+  MIRVocabulary Vocab(std::move(OpcodeMap), std::move(CommonOperandMap),
+                      std::move(PhyRegMap), std::move(VirtRegMap), TII, TRI,
+                      MRI);
+
+  // Validate Storage after construction
+  if (!Vocab.Storage.isValid())
+    return createStringError(errc::invalid_argument,
+                             "Failed to create valid vocabulary storage");
+  Vocab.ZeroEmbedding = Embedding(Vocab.Storage.getDimension(), 0.0);
+  return std::move(Vocab);
 }
 
 std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) {
@@ -122,22 +155,74 @@ unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const {
   return getCanonicalIndexForBaseName(BaseOpcode);
 }
 
+unsigned
+MIRVocabulary::getCanonicalIndexForOperandName(StringRef OperandName) const {
+  // Slot = start of the common-operand section + position in the name table.
+  const auto First = std::begin(CommonOperandNames);
+  const auto Last = std::end(CommonOperandNames);
+  const auto Match = std::find(First, Last, OperandName);
+  assert(Match != Last && "Operand name not found in common operands");
+  return Layout.CommonOperandBase + std::distance(First, Match);
+}
+
+unsigned
+MIRVocabulary::getCanonicalIndexForRegisterClass(StringRef RegName,
+                                                 bool IsPhysical) const {
+  // Both register sections share one name table; only the base differs.
+  const auto Last = RegisterOperandNames.end();
+  const auto Match = std::find(RegisterOperandNames.begin(), Last, RegName);
+  assert(Match != Last && "Register name not found in register operands");
+  unsigned Offset = std::distance(RegisterOperandNames.begin(), Match);
+  return (IsPhysical ? Layout.PhyRegBase : Layout.VirtRegBase) + Offset;
+}
+
 std::string MIRVocabulary::getStringKey(unsigned Pos) const {
   assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary");
 
-  // For now, all entries are opcodes since we only have one section
-  if (Pos < Layout.OperandBase && Pos < UniqueBaseOpcodeNames.size()) {
+  // Handle opcodes section
+  if (Pos < Layout.CommonOperandBase) {
     // Convert canonical index back to base opcode name
     auto It = UniqueBaseOpcodeNames.begin();
     std::advance(It, Pos);
+    assert(It != UniqueBaseOpcodeNames.end() &&
+           "Canonical index out of bounds in opcode section");
     return *It;
   }
 
-  llvm_unreachable("Invalid position in vocabulary");
-  return "";
+  auto getLocalIndex = [](unsigned Pos, size_t BaseOffset, size_t Bound,
+                          const char *Msg) {
+    unsigned LocalIndex = Pos - BaseOffset;
+    assert(LocalIndex < Bound && Msg);
+    return LocalIndex;
+  };
+
+  // Handle common operands section
+  if (Pos < Layout.PhyRegBase) {
+    unsigned LocalIndex = getLocalIndex(
+        Pos, Layout.CommonOperandBase, std::size(CommonOperandNames),
+        "Local index out of bounds in common operands");
+    return CommonOperandNames[LocalIndex].str();
+  }
+
+  // Handle physical registers section
+  if (Pos < Layout.VirtRegBase) {
+    unsigned LocalIndex =
+        getLocalIndex(Pos, Layout.PhyRegBase, RegisterOperandNames.size(),
+                      "Local index out of bounds in physical registers");
+    return "PhyReg_" + RegisterOperandNames[LocalIndex];
+  }
+
+  // Handle virtual registers section
+  unsigned LocalIndex =
+      getLocalIndex(Pos, Layout.VirtRegBase, RegisterOperandNames.size(),
+                    "Local index out of bounds in virtual registers");
+  return "VirtReg_" + RegisterOperandNames[LocalIndex];
 }
 
-void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
+void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap,
+                                    const VocabMap &CommonOperandsMap,
+                                    const VocabMap &PhyRegMap,
+                                    const VocabMap &VirtRegMap) {
 
   // Helper for handling missing entities in the vocabulary.
   // Currently, we use a zero vector. In the future, we will throw an error to
@@ -151,14 +236,14 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
 
   // Initialize opcode embeddings section
   unsigned EmbeddingDim = OpcodeMap.begin()->second.size();
-  std::vector<Embedding> OpcodeEmbeddings(Layout.OperandBase,
+  std::vector<Embedding> OpcodeEmbeddings(Layout.CommonOperandBase,
                                           Embedding(EmbeddingDim));
 
   // Populate opcode embeddings using canonical mapping
   for (auto COpcodeName : UniqueBaseOpcodeNames) {
     if (auto It = OpcodeMap.find(COpcodeName); It != OpcodeMap.end()) {
       auto COpcodeIndex = getCanonicalIndexForBaseName(COpcodeName);
-      assert(COpcodeIndex < Layout.OperandBase &&
+      assert(COpcodeIndex < Layout.CommonOperandBase &&
              "Canonical index out of bounds");
       OpcodeEmbeddings[COpcodeIndex] = It->second;
     } else {
@@ -166,8 +251,39 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
     }
   }
 
-  // TODO: Add operand/argument embeddings as additional sections
-  // This will require extending the vocabulary format and layout
+  // Initialize common operand embeddings section
+  std::vector<Embedding> CommonOperandEmbeddings(std::size(CommonOperandNames),
+                                                 Embedding(EmbeddingDim));
+  unsigned OperandIndex = 0;
+  for (const auto &CommonOperandName : CommonOperandNames) {
+    if (auto It = CommonOperandsMap.find(CommonOperandName.str());
+        It != CommonOperandsMap.end()) {
+      CommonOperandEmbeddings[OperandIndex] = It->second;
+    } else {
+      handleMissingEntity(CommonOperandName);
+    }
+    ++OperandIndex;
+  }
+
+  // Helper lambda for creating register operand embeddings
+  auto createRegisterEmbeddings = [&](const VocabMap &RegMap) {
+    std::vector<Embedding> RegEmbeddings(TRI.getNumRegClasses(),
+                                         Embedding(EmbeddingDim));
+    unsigned RegOperandIndex = 0;
+    for (const auto &RegOperandName : RegisterOperandNames) {
+      if (auto It = RegMap.find(RegOperandName); It != RegMap.end())
+        RegEmbeddings[RegOperandIndex] = It->second;
+      else
+        handleMissingEntity(RegOperandName);
+      ++RegOperandIndex;
+    }
+    return RegEmbeddings;
+  };
+
+  // Initialize register operand embeddings sections
+  std::vector<Embedding> PhyRegEmbeddings = createRegisterEmbeddings(PhyRegMap);
+  std::vector<Embedding> VirtRegEmbeddings =
+      createRegisterEmbeddings(VirtRegMap);
 
   // Scale the vocabulary sections based on the provided weights
   auto scaleVocabSection = [](std::vector<Embedding> &Embeddings,
@@ -176,9 +292,20 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
       Embedding *= Weight;
   };
   scaleVocabSection(OpcodeEmbeddings, OpcWeight);
-
-  std::vector<std::vector<Embedding>> Sections(1);
-  Sections[0] = std::move(OpcodeEmbeddings);
+  scaleVocabSection(CommonOperandEmbeddings, CommonOperandWeight);
+  scaleVocabSection(PhyRegEmbeddings, RegOperandWeight);
+  scaleVocabSection(VirtRegEmbeddings, RegOperandWeight);
+
+  std::vector<std::vector<Embedding>> Sections(
+      static_cast<unsigned>(Section::MaxSections));
+  Sections[static_cast<unsigned>(Section::Opcodes)] =
+      std::move(OpcodeEmbeddings);
+  Sections[static_cast<unsigned>(Section::CommonOperands)] =
+      std::move(CommonOperandEmbeddings);
+  Sections[static_cast<unsigned>(Section::PhyRegisters)] =
+      std::move(PhyRegEmbeddings);
+  Sections[static_cast<unsigned>(Section::VirtRegisters)] =
+      std::move(VirtRegEmbeddings);
 
   Storage = ir2vec::VocabStorage(std::move(Sections));
 }
@@ -199,26 +326,94 @@ void MIRVocabulary::buildCanonicalOpcodeMapping() {
                     << " unique base opcodes\n");
 }
 
-Expected<MIRVocabulary>
-MIRVocabulary::createDummyVocabForTest(const TargetInstrInfo &TII,
-                                       unsigned Dim) {
+void MIRVocabulary::buildRegisterOperandMapping() {
+  // Check if already built
+  if (!RegisterOperandNames.empty())
+    return;
+
+  for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) {
+    const TargetRegisterClass *RegClass = TRI.getRegClass(RC);
+    if (!RegClass)
+      continue;
+
+    // Get the register class name
+    StringRef ClassName = TRI.getRegClassName(RegClass);
+    RegisterOperandNames.push_back(ClassName.str());
+  }
+}
+
+unsigned MIRVocabulary::getCommonOperandIndex(
+    MachineOperand::MachineOperandType OperandType) const {
+  assert(OperandType != MachineOperand::MO_Register &&
+         "Expected non-register operand type");
+  assert(OperandType > MachineOperand::MO_Register &&
+         OperandType < MachineOperand::MO_Last && "Operand type out of bounds");
+  return static_cast<unsigned>(OperandType) - 1;
+}
+
+unsigned MIRVocabulary::getRegisterOperandIndex(Register Reg) const {
+  assert(!RegisterOperandNames.empty() && "Register operand mapping not built");
+  assert(Reg.isValid() && "Invalid register; not expected here");
+  assert((Reg.isPhysical() || Reg.isVirtual()) &&
+         "Expected a physical or virtual register");
+
+  const TargetRegisterClass *RegClass = nullptr;
+
+  // For physical registers, use TRI to get minimal register class as a
+  // physical register can belong to multiple classes. For virtual
+  // registers, use MRI to uniquely identify the assigned register class.
+  if (Reg.isPhysical())
+    RegClass = TRI.getMinimalPhysRegClass(Reg);
+  else
+    RegClass = MRI.getRegClass(Reg);
+
+  if (RegClass)
+    return RegClass->getID();
+  // A valid register is expected to have a class, so this is unreachable.
+  llvm_unreachable("Register operand without a valid register class");
+  return 0;
+}
+
+Expected<MIRVocabulary> MIRVocabulary::createDummyVocabForTest(
+    const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+    const MachineRegisterInfo &MRI, unsigned Dim) {
   assert(Dim > 0 && "Dimension must be greater than zero");
 
   float DummyVal = 0.1f;
 
-  // Create dummy embeddings for all canonical opcode names
-  VocabMap DummyVocabMap;
+  VocabMap DummyOpcMap, DummyOperandMap, DummyPhyRegMap, DummyVirtRegMap;
+
+  // Process opcodes directly without creating temporary vocabulary
   for (unsigned Opcode = 0; Opcode < TII.getNumOpcodes(); ++Opcode) {
     std::string BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode));
-    if (DummyVocabMap.count(BaseOpcode) == 0) {
-      // Only add if not already present
-      DummyVocabMap[BaseOpcode] = Embedding(Dim, DummyVal);
+    if (DummyOpcMap.count(BaseOpcode) == 0) { // Only add if not already present
+      DummyOpcMap[BaseOpcode] = Embedding(Dim, DummyVal);
       DummyVal += 0.1f;
     }
   }
 
-  // Create and return vocabulary with dummy embeddings
-  return MIRVocabulary::create(std::move(DummyVocabMap), TII);
+  // Add common operands
+  for (const auto &CommonOperandName : CommonOperandNames) {
+    DummyOperandMap[CommonOperandName.str()] = Embedding(Dim, DummyVal);
+    DummyVal += 0.1f;
+  }
+
+  // Process register classes directly
+  for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) {
+    const TargetRegisterClass *RegClass = TRI.getRegClass(RC);
+    if (!RegClass)
+      continue;
+
+    std::string ClassName = TRI.getRegClassName(RegClass);
+    DummyPhyRegMap[ClassName] = Embedding(Dim, DummyVal);
+    DummyVirtRegMap[ClassName] = Embedding(Dim, DummyVal);
+    DummyVal += 0.1f;
+  }
+
+  // Create vocabulary directly without temporary instance
+  return MIRVocabulary::create(
+      std::move(DummyOpcMap), std::move(DummyOperandMap),
+      std::move(DummyPhyRegMap), std::move(DummyVirtRegMap), TII, TRI, MRI);
 }
 
 //===----------------------------------------------------------------------===//
@@ -236,9 +431,10 @@ StringRef MIR2VecVocabLegacyAnalysis::getPassName() const {
   return "MIR2Vec Vocabulary Analysis";
 }
 
-Error MIR2VecVocabLegacyAnalysis::readVocabulary() {
-  // TODO: Extend vocabulary format to support multiple sections
-  // (opcodes, operands, etc.) similar to IR2Vec structure
+Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab,
+                                                 VocabMap &CommonOperandVocab,
+                                                 VocabMap &PhyRegVocabMap,
+                                                 VocabMap &VirtRegVocabMap) {
   if (VocabFile.empty())
     return createStringError(
         errc::invalid_argument,
@@ -255,21 +451,47 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() {
   if (!ParsedVocabValue)
     return ParsedVocabValue.takeError();
 
-  unsigned Dim = 0;
+  unsigned OpcodeDim = 0, CommonOperandDim = 0, PhyRegOperandDim = 0,
+           VirtRegOperandDim = 0;
+  if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+          "Opcodes", *ParsedVocabValue, OpcodeVocab, OpcodeDim))
+    return Err;
+
+  if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+          "CommonOperands", *ParsedVocabValue, CommonOperandVocab,
+          CommonOperandDim))
+    return Err;
+
+  if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+          "PhysicalRegisters", *ParsedVocabValue, PhyRegVocabMap,
+          PhyRegOperandDim))
+    return Err;
+
   if (auto Err = ir2vec::VocabStorage::parseVocabSection(
-          "entities", *ParsedVocabValue, StrVocabMap, Dim))
+          "VirtualRegisters", *ParsedVocabValue, VirtRegVocabMap,
+          VirtRegOperandDim))
     return Err;
 
+  // All sections must have the same embedding dimension
+  if (!(OpcodeDim == CommonOperandDim && CommonOperandDim == PhyRegOperandDim &&
+        PhyRegOperandDim == VirtRegOperandDim)) {
+    return createStringError(
+        errc::illegal_byte_sequence,
+        "MIR2Vec vocabulary sections have different dimensions");
+  }
+
   return Error::success();
 }
 
 Expected<mir2vec::MIRVocabulary>
 MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
-  if (StrVocabMap.empty()) {
-    if (Error Err = readVocabulary()) {
-      return std::move(Err);
-    }
-  }
+  if (Vocab.has_value())
+    return std::move(Vocab.value());
+
+  VocabMap OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+  if (Error Err =
+          readVocabulary(OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap))
+    return std::move(Err);
 
   // Get machine module info to access machine functions and target info
   MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -280,8 +502,24 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
       continue;
 
     if (auto *MF = MMI.getMachineFunction(F)) {
-      const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-      return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII);
+      auto &Subtarget = MF->getSubtarget();
+      const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+      if (!TII) {
+        return createStringError(errc::invalid_argument,
+                                 "No TargetInstrInfo available; cannot create "
+                                 "MIR2Vec vocabulary");
+      }
+
+      const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+      if (!TRI) {
+        return createStringError(errc::invalid_argument,
+                                 "No TargetRegisterInfo available; cannot "
+                                 "create MIR2Vec vocabulary");
+      }
+
+      return mir2vec::MIRVocabulary::create(
+          std::move(OpcMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+          std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
     }
   }
 
@@ -351,9 +589,14 @@ Embedding SymbolicMIREmbedder::computeEmbeddings(const MachineInstr &MI) const {
   if (MI.isDebugInstr())
     return Embedding(Dimension, 0);
 
-  // Todo: Add operand/argument contributions
+  // Opcode embedding
+  Embedding InstructionEmbedding = Vocab[MI.getOpcode()];
+
+  // Add operand contributions
+  for (const MachineOperand &MO : MI.operands())
+    InstructionEmbedding += Vocab[MO];
 
-  return Vocab[MI.getOpcode()];
+  return InstructionEmbedding;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 1cb57a4..ba0b025 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1137,7 +1137,7 @@ public:
     MF.setDelegate(this);
   }
 
-  ~SlotIndexUpdateDelegate() {
+  ~SlotIndexUpdateDelegate() override {
     MF.resetDelegate(this);
     for (auto MI : Insertions)
       Indexes->insertMachineInstrInMaps(*MI);
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index e1d39d6..493d8df 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -196,7 +196,7 @@ public:
   CopyRewriter(MachineInstr &MI) : Rewriter(MI) {
     assert(MI.isCopy() && "Expected copy instruction");
   }
-  virtual ~CopyRewriter() = default;
+  ~CopyRewriter() override = default;
 
   bool getNextRewritableSource(RegSubRegPair &Src,
                                RegSubRegPair &Dst) override {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 310d35d..d2ea652 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2476,16 +2476,17 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
 /// masked vector operation if the target supports it.
 static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
                                               bool ShouldCommuteOperands) {
-  // Match a select as operand 1. The identity constant that we are looking for
-  // is only valid as operand 1 of a non-commutative binop.
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+
+  // Match a select as operand 1. The identity constant that we are looking for
+  // is only valid as operand 1 of a non-commutative binop.
   if (ShouldCommuteOperands)
     std::swap(N0, N1);
 
-  unsigned SelOpcode = N1.getOpcode();
-  if ((SelOpcode != ISD::VSELECT && SelOpcode != ISD::SELECT) ||
-      !N1.hasOneUse())
+  SDValue Cond, TVal, FVal;
+  if (!sd_match(N1, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal),
+                                          m_Value(FVal)))))
     return SDValue();
 
   // We can't hoist all instructions because of immediate UB (not speculatable).
@@ -2493,11 +2494,9 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
   if (!DAG.isSafeToSpeculativelyExecuteNode(N))
     return SDValue();
 
+  unsigned SelOpcode = N1.getOpcode();
   unsigned Opcode = N->getOpcode();
   EVT VT = N->getValueType(0);
-  SDValue Cond = N1.getOperand(0);
-  SDValue TVal = N1.getOperand(1);
-  SDValue FVal = N1.getOperand(2);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // This transform increases uses of N0, so freeze it to be safe.
@@ -13856,12 +13855,11 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
           Opcode == ISD::ANY_EXTEND) &&
          "Expected EXTEND dag node in input!");
 
-  if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
-      !N0.hasOneUse())
+  SDValue Cond, Op1, Op2;
+  if (!sd_match(N0, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(Op1),
+                                          m_Value(Op2)))))
     return SDValue();
 
-  SDValue Op1 = N0->getOperand(1);
-  SDValue Op2 = N0->getOperand(2);
   if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
     return SDValue();
 
@@ -13883,7 +13881,7 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
 
   SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
   SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
-  return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
+  return DAG.getSelect(DL, VT, Cond, Ext1, Ext2);
 }
 
 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
@@ -17462,8 +17460,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // fold (fsub (fpext (fneg (fmul, x, y))), z)
   //   -> (fneg (fma (fpext x), (fpext y), z))
   // Note: This could be removed with appropriate canonicalization of the
-  // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
-  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+  // input expression into (fneg (fadd (fpext (fmul, x, y)), z)). However, the
+  // command line flag -fp-contract=fast and fast-math flag contract prevent us
   // from implementing the canonicalization in visitFSUB.
   if (matcher.match(N0, ISD::FP_EXTEND)) {
     SDValue N00 = N0.getOperand(0);
@@ -17487,7 +17485,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   //   -> (fneg (fma (fpext x)), (fpext y), z)
   // Note: This could be removed with appropriate canonicalization of the
   // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
-  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+  // command line flag -fp-contract=fast and fast-math flag contract prevent us
   // from implementing the canonicalization in visitFSUB.
   if (matcher.match(N0, ISD::FNEG)) {
     SDValue N00 = N0.getOperand(0);
@@ -29620,13 +29618,14 @@ static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
   }
 
   // c ? X : Y -> c ? Log2(X) : Log2(Y)
-  if ((Op.getOpcode() == ISD::SELECT || Op.getOpcode() == ISD::VSELECT) &&
-      Op.hasOneUse()) {
-    if (SDValue LogX = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(1),
-                                           Depth + 1, AssumeNonZero))
-      if (SDValue LogY = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(2),
-                                             Depth + 1, AssumeNonZero))
-        return DAG.getSelect(DL, VT, Op.getOperand(0), LogX, LogY);
+  SDValue Cond, TVal, FVal;
+  if (sd_match(Op, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal),
+                                         m_Value(FVal))))) {
+    if (SDValue LogX =
+            takeInexpensiveLog2(DAG, DL, VT, TVal, Depth + 1, AssumeNonZero))
+      if (SDValue LogY =
+              takeInexpensiveLog2(DAG, DL, VT, FVal, Depth + 1, AssumeNonZero))
+        return DAG.getSelect(DL, VT, Cond, LogX, LogY);
   }
 
   // log2(umin(X, Y)) -> umin(log2(X), log2(Y))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 20a0efd..dcf2df3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1977,8 +1977,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
   if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
     Register InReg = FuncInfo.InitializeRegForValue(Inst);
 
+    std::optional<CallingConv::ID> CallConv;
+    auto *CI = dyn_cast<CallInst>(Inst);
+    if (CI && !CI->isInlineAsm())
+      CallConv = CI->getCallingConv();
+
     RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
-                     Inst->getType(), std::nullopt);
+                     Inst->getType(), CallConv);
     SDValue Chain = DAG.getEntryNode();
     return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
   }
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 826e412..8358105 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -319,7 +319,7 @@ bool ShrinkWrapImpl::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS,
       return isa<GlobalValue>(UO);
     }
     if (const PseudoSourceValue *PSV = Op->getPseudoValue())
-      return PSV->isJumpTable();
+      return PSV->isJumpTable() || PSV->isConstantPool();
     return false;
   };
   // Load/store operations may access the stack indirectly when we previously
diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index 5eb86e7..049efe8 100644
--- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -51,7 +51,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const {
 /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
 /// that the rounding mode of the FPU can change from its default.
 bool TargetOptions::HonorSignDependentRoundingFPMath() const {
-  return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+  return HonorSignDependentRoundingFPMathOption;
 }
 
 /// NOTE: There are targets that still do not support the debug entry values
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 10f915d..b838e36 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5262,33 +5262,47 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
       return;
     }
 
+    auto GetMaybeAlign = [](Value *Op) {
+      if (auto *CI = dyn_cast<ConstantInt>(Op)) {
+        uint64_t Val = CI->getZExtValue();
+        if (Val == 0)
+          return MaybeAlign();
+        if (isPowerOf2_64(Val))
+          return MaybeAlign(Val);
+      }
+      reportFatalUsageError("Invalid alignment argument");
+    };
+    auto GetAlign = [&](Value *Op) {
+      MaybeAlign Align = GetMaybeAlign(Op);
+      if (Align)
+        return *Align;
+      reportFatalUsageError("Invalid zero alignment argument");
+    };
+
     const DataLayout &DL = CI->getDataLayout();
     switch (NewFn->getIntrinsicID()) {
     case Intrinsic::masked_load:
       NewCall = Builder.CreateMaskedLoad(
-          CI->getType(), CI->getArgOperand(0),
-          cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(),
+          CI->getType(), CI->getArgOperand(0), GetAlign(CI->getArgOperand(1)),
           CI->getArgOperand(2), CI->getArgOperand(3));
       break;
     case Intrinsic::masked_gather:
       NewCall = Builder.CreateMaskedGather(
           CI->getType(), CI->getArgOperand(0),
-          DL.getValueOrABITypeAlignment(
-              cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue(),
-              CI->getType()->getScalarType()),
+          DL.getValueOrABITypeAlignment(GetMaybeAlign(CI->getArgOperand(1)),
+                                        CI->getType()->getScalarType()),
           CI->getArgOperand(2), CI->getArgOperand(3));
       break;
     case Intrinsic::masked_store:
       NewCall = Builder.CreateMaskedStore(
           CI->getArgOperand(0), CI->getArgOperand(1),
-          cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(),
-          CI->getArgOperand(3));
+          GetAlign(CI->getArgOperand(2)), CI->getArgOperand(3));
       break;
     case Intrinsic::masked_scatter:
       NewCall = Builder.CreateMaskedScatter(
           CI->getArgOperand(0), CI->getArgOperand(1),
           DL.getValueOrABITypeAlignment(
-              cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue(),
+              GetMaybeAlign(CI->getArgOperand(2)),
               CI->getArgOperand(0)->getType()->getScalarType()),
           CI->getArgOperand(3));
       break;
@@ -6045,6 +6059,120 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
   }
 }
 
+// Set the function attribute to Value unless it is already present.
+static void setFunctionAttrIfNotSet(Function &F, StringRef FnAttrName,
+                                    StringRef Value) {
+  if (!F.hasFnAttribute(FnAttrName))
+    F.addFnAttr(FnAttrName, Value);
+}
+
+// If the function attribute is absent, add it (valueless) when Set is true.
+// If it is present with the value "false", remove it.
+// If it is present with the value "true", reset it to a valueless attribute.
+static void ConvertFunctionAttr(Function &F, bool Set, StringRef FnAttrName) {
+  if (!F.hasFnAttribute(FnAttrName)) {
+    if (Set)
+      F.addFnAttr(FnAttrName);
+  } else {
+    auto A = F.getFnAttribute(FnAttrName);
+    if ("false" == A.getValueAsString())
+      F.removeFnAttr(FnAttrName);
+    else if ("true" == A.getValueAsString()) {
+      F.removeFnAttr(FnAttrName);
+      F.addFnAttr(FnAttrName);
+    }
+  }
+}
+
+// For ARM/Thumb/AArch64 modules, turn module flags into function attributes.
+void llvm::copyModuleAttrToFunctions(Module &M) {
+  Triple T(M.getTargetTriple());
+  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
+    return;
+
+  uint64_t BTEValue = 0;
+  uint64_t BPPLRValue = 0;
+  uint64_t GCSValue = 0;
+  uint64_t SRAValue = 0;
+  uint64_t SRAALLValue = 0;
+  uint64_t SRABKeyValue = 0;
+
+  NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
+  if (ModFlags) {
+    for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
+      MDNode *Op = ModFlags->getOperand(I);
+      if (Op->getNumOperands() != 3)
+        continue;
+      // Null-tolerant casts: a null or mismatched operand is skipped below.
+      MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
+      auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+      if (!ID || !CI)
+        continue;
+
+      StringRef IDStr = ID->getString();
+      uint64_t *ValPtr = IDStr == "branch-target-enforcement"    ? &BTEValue
+                         : IDStr == "branch-protection-pauth-lr" ? &BPPLRValue
+                         : IDStr == "guarded-control-stack"      ? &GCSValue
+                         : IDStr == "sign-return-address"        ? &SRAValue
+                         : IDStr == "sign-return-address-all"    ? &SRAALLValue
+                         : IDStr == "sign-return-address-with-bkey"
+                             ? &SRABKeyValue
+                             : nullptr;
+      if (!ValPtr)
+        continue;
+      // A value of 2 is what this function writes back below, so stop here.
+      *ValPtr = CI->getZExtValue();
+      if (*ValPtr == 2)
+        return;
+    }
+  }
+
+  bool BTE = BTEValue == 1;
+  bool BPPLR = BPPLRValue == 1;
+  bool GCS = GCSValue == 1;
+  bool SRA = SRAValue == 1;
+  StringRef SignTypeValue = "non-leaf";
+  if (SRA && SRAALLValue == 1)
+    SignTypeValue = "all";
+
+  StringRef SignKeyValue = "a_key";
+  if (SRA && SRABKeyValue == 1)
+    SignKeyValue = "b_key";
+
+  for (Function &F : M.getFunctionList()) {
+    if (F.isDeclaration())
+      continue;
+
+    if (SRA) {
+      setFunctionAttrIfNotSet(F, "sign-return-address", SignTypeValue);
+      setFunctionAttrIfNotSet(F, "sign-return-address-key", SignKeyValue);
+    } else {
+      if (auto A = F.getFnAttribute("sign-return-address");
+          A.isValid() && "none" == A.getValueAsString()) {
+        F.removeFnAttr("sign-return-address");
+        F.removeFnAttr("sign-return-address-key");
+      }
+    }
+    ConvertFunctionAttr(F, BTE, "branch-target-enforcement");
+    ConvertFunctionAttr(F, BPPLR, "branch-protection-pauth-lr");
+    ConvertFunctionAttr(F, GCS, "guarded-control-stack");
+  }
+  // Set the flags that were applied above to 2, using Module::Min behavior.
+  if (BTE)
+    M.setModuleFlag(llvm::Module::Min, "branch-target-enforcement", 2);
+  if (BPPLR)
+    M.setModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", 2);
+  if (GCS)
+    M.setModuleFlag(llvm::Module::Min, "guarded-control-stack", 2);
+  if (SRA) {
+    M.setModuleFlag(llvm::Module::Min, "sign-return-address", 2);
+    if (SRAALLValue == 1)
+      M.setModuleFlag(llvm::Module::Min, "sign-return-address-all", 2);
+    if (SRABKeyValue == 1)
+      M.setModuleFlag(llvm::Module::Min, "sign-return-address-with-bkey", 2);
+  }
+}
+
 static bool isOldLoopArgument(Metadata *MD) {
   auto *T = dyn_cast_or_null<MDTuple>(MD);
   if (!T)
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index a6353664..62fd62c 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -111,11 +111,13 @@ uint64_t ModuleSummaryIndex::getFlags() const {
     Flags |= 0x100;
   if (hasUnifiedLTO())
     Flags |= 0x200;
+  if (withInternalizeAndPromote())
+    Flags |= 0x400;
   return Flags;
 }
 
 void ModuleSummaryIndex::setFlags(uint64_t Flags) {
-  assert(Flags <= 0x2ff && "Unexpected bits in flag");
+  assert(Flags <= 0x7ff && "Unexpected bits in flag");
   // 1 bit: WithGlobalValueDeadStripping flag.
   // Set on combined index only.
   if (Flags & 0x1)
@@ -154,6 +156,10 @@ void ModuleSummaryIndex::setFlags(uint64_t Flags) {
   // Set on combined index only.
   if (Flags & 0x200)
     setUnifiedLTO();
+  // 1 bit: WithInternalizeAndPromote flag.
+  // Set on combined index only.
+  if (Flags & 0x400)
+    setWithInternalizeAndPromote();
 }
 
 // Collect for the given module the list of function it defines
diff --git a/llvm/lib/IRReader/IRReader.cpp b/llvm/lib/IRReader/IRReader.cpp
index a7e7dee..c16871f 100644
--- a/llvm/lib/IRReader/IRReader.cpp
+++ b/llvm/lib/IRReader/IRReader.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/IRReader/IRReader.h"
 #include "llvm-c/IRReader.h"
+#include "llvm/AsmParser/AsmParserContext.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/LLVMContext.h"
@@ -68,7 +69,8 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename,
 
 std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
                                       LLVMContext &Context,
-                                      ParserCallbacks Callbacks) {
+                                      ParserCallbacks Callbacks,
+                                      llvm::AsmParserContext *ParserContext) {
   NamedRegionTimer T(TimeIRParsingName, TimeIRParsingDescription,
                      TimeIRParsingGroupName, TimeIRParsingGroupDescription,
                      TimePassesIsEnabled);
@@ -88,12 +90,14 @@ std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
 
   return parseAssembly(Buffer, Err, Context, nullptr,
                        Callbacks.DataLayout.value_or(
-                           [](StringRef, StringRef) { return std::nullopt; }));
+                           [](StringRef, StringRef) { return std::nullopt; }),
+                       ParserContext);
 }
 
 std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
                                           LLVMContext &Context,
-                                          ParserCallbacks Callbacks) {
+                                          ParserCallbacks Callbacks,
+                                          AsmParserContext *ParserContext) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename, /*IsText=*/true);
   if (std::error_code EC = FileOrErr.getError()) {
@@ -102,7 +106,8 @@ std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
     return nullptr;
   }
 
-  return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks);
+  return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks,
+                 ParserContext);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index cbc0b1d..72ae064 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -551,9 +551,11 @@ void llvm::thinLTOInternalizeAndPromoteInIndex(
     function_ref<bool(StringRef, ValueInfo)> isExported,
     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
         isPrevailing) {
+  assert(!Index.withInternalizeAndPromote());
   for (auto &I : Index)
     thinLTOInternalizeAndPromoteGUID(Index.getValueInfo(I), isExported,
                                      isPrevailing);
+  Index.setWithInternalizeAndPromote();
 }
 
 // Requires a destructor for std::vector<InputModule>.
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 1bff6cd..f78d9b0 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1512,6 +1512,11 @@ Error IRLinker::run() {
   // Loop over all of the linked values to compute type mappings.
   computeTypeMapping();
 
+  // Convert module-level attributes to function-level attributes: after
+  // modules are merged the module-level values might change, and would then
+  // affect the functions differently than in their original module.
+  copyModuleAttrToFunctions(*SrcM);
+
   std::reverse(Worklist.begin(), Worklist.end());
   while (!Worklist.empty()) {
     GlobalValue *GV = Worklist.back();
@@ -1677,6 +1682,11 @@ IRMover::IRMover(Module &M) : Composite(M) {
   for (const auto *MD : StructTypes.getVisitedMetadata()) {
     SharedMDs[MD].reset(const_cast<MDNode *>(MD));
   }
+
+  // Convert module-level attributes to function-level attributes: once
+  // modules are merged the module-level values may change, which would alter
+  // how they apply to functions compared with their original module.
+  copyModuleAttrToFunctions(M);
 }
 
 Error IRMover::move(std::unique_ptr<Module> Src,
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index e45cac8..048c58d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1095,6 +1095,31 @@ Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) {
   return Result;
 }
 
+/// Parser of parameters for AllocToken pass.
+Expected<AllocTokenOptions> parseAllocTokenPassOptions(StringRef Params) {
+  AllocTokenOptions Result;
+  while (!Params.empty()) {
+    StringRef ParamName;
+    std::tie(ParamName, Params) = Params.split(';');
+
+    // "mode=<mode>" is the only recognized parameter.
+    if (!ParamName.consume_front("mode="))
+      return make_error<StringError>(
+          formatv("invalid AllocToken pass parameter '{}'", ParamName).str(),
+          inconvertibleErrorCode());
+    auto Mode = getAllocTokenModeFromString(ParamName);
+    if (!Mode)
+      return make_error<StringError>(
+          formatv("invalid argument to AllocToken pass mode "
+                  "parameter: '{}'",
+                  ParamName)
+              .str(),
+          inconvertibleErrorCode());
+    Result.Mode = *Mode;
+  }
+  return Result;
+}
+
 /// Parser of parameters for SimplifyCFG pass.
 Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
   SimplifyCFGOptions Result;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 884d8da..a66b6e4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -126,7 +126,6 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
 MODULE_PASS("openmp-opt-postlink",
             OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
 MODULE_PASS("partial-inliner", PartialInlinerPass())
-MODULE_PASS("alloc-token", AllocTokenPass())
 MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
 MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
 MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
@@ -183,6 +182,10 @@ MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
 #define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
 #endif
 MODULE_PASS_WITH_PARAMS(
+    "alloc-token", "AllocTokenPass",
+    [](AllocTokenOptions Opts) { return AllocTokenPass(Opts); },
+    parseAllocTokenPassOptions, "mode=<mode>")
+MODULE_PASS_WITH_PARAMS(
     "asan", "AddressSanitizerPass",
     [](AddressSanitizerOptions Opts) { return AddressSanitizerPass(Opts); },
     parseASanPassOptions, "kernel;use-after-scope")
diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp
new file mode 100644
index 0000000..8e9e89f
--- /dev/null
+++ b/llvm/lib/Support/AllocToken.cpp
@@ -0,0 +1,71 @@
+//===- AllocToken.cpp - Allocation Token Calculation ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of AllocToken modes and shared calculation of stateless token IDs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/AllocToken.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SipHash.h"
+
+using namespace llvm;
+
+/// Map the textual name of a token mode to its AllocTokenMode enumerator.
+/// Yields std::nullopt when \p Name is not one of the spellings below.
+std::optional<AllocTokenMode>
+llvm::getAllocTokenModeFromString(StringRef Name) {
+  using Mode = AllocTokenMode;
+  return StringSwitch<std::optional<Mode>>(Name)
+      .Case("increment", Mode::Increment)
+      .Case("random", Mode::Random)
+      .Case("typehash", Mode::TypeHash)
+      .Case("typehashpointersplit", Mode::TypeHashPointerSplit)
+      .Default(std::nullopt);
+}
+
+/// Fold the stable SipHash of the type name into the range [0, MaxTokens).
+/// \p MaxTokens must be non-zero because it is used as the modulus.
+static uint64_t getStableHash(const AllocTokenMetadata &Metadata,
+                              uint64_t MaxTokens) {
+  const uint64_t TypeNameHash = getStableSipHash(Metadata.TypeName);
+  return TypeNameHash % MaxTokens;
+}
+
+/// Compute the allocation token for \p Metadata using \p Mode.
+/// Every token produced is smaller than \p MaxTokens (which must be non-zero);
+/// the stateful modes Increment and Random produce std::nullopt instead.
+std::optional<uint64_t> llvm::getAllocToken(AllocTokenMode Mode,
+                                            const AllocTokenMetadata &Metadata,
+                                            uint64_t MaxTokens) {
+  assert(MaxTokens && "Must provide non-zero max tokens");
+
+  switch (Mode) {
+  case AllocTokenMode::Increment:
+  case AllocTokenMode::Random:
+    // Stateful modes cannot be implemented as a pure function.
+    return std::nullopt;
+
+  case AllocTokenMode::TypeHash:
+    return getStableHash(Metadata, MaxTokens);
+
+  case AllocTokenMode::TypeHashPointerSplit: {
+    // A single token leaves nothing to split.
+    if (MaxTokens == 1)
+      return 0;
+    // Hash into the lower half of the token range, then shift types that
+    // contain pointers into the upper half.
+    const uint64_t HalfTokens = MaxTokens / 2;
+    const uint64_t Offset = Metadata.ContainsPointer ? HalfTokens : 0;
+    return getStableHash(Metadata, HalfTokens) + Offset;
+  }
+  }
+
+  llvm_unreachable("");
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 42b21b5..671a5fe 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -149,6 +149,7 @@ add_llvm_component_library(LLVMSupport
   AArch64BuildAttributes.cpp
   ARMAttributeParser.cpp
   ARMWinEH.cpp
+  AllocToken.cpp
   Allocator.cpp
   AutoConvert.cpp
   Base64.cpp
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 31fcd63..5d9215d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -136,8 +136,8 @@ def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
                (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend))))),
           (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
 def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
-               (am_indexed32 GPR64sp:$Rn, uimm12s8:$offset))))),
-          (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>;
+               (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+          (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
 def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
                (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
           (LDURSi GPR64sp:$Rn, simm9:$offset)>;
@@ -236,11 +236,11 @@ def : Pat<(relaxed_store<atomic_store_32>
 def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
           (STLRX GPR64:$val, GPR64sp:$ptr)>;
 def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                                         ro_Wextend16:$extend),
+                                                         ro_Wextend64:$extend),
                                           GPR64:$val),
           (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
 def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                                         ro_Xextend16:$extend),
+                                                         ro_Xextend64:$extend),
                                           GPR64:$val),
           (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
 def : Pat<(relaxed_store<atomic_store_64>
@@ -276,8 +276,8 @@ def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
                                           (i64 (bitconvert (f64 FPR64Op:$val)))),
           (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
 def : Pat<(relaxed_store<atomic_store_64>
-              (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
-          (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+              (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
+          (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
 def : Pat<(relaxed_store<atomic_store_64>
                (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
           (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index fe84193..30b7b03 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -507,7 +507,7 @@ let AddedComplexity = 19 in {
   defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
 }
 
-def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
+def : Pat<(v8i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
           (LD1Rv8b GPR64sp:$Rn)>;
 def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
           (LD1Rv16b GPR64sp:$Rn)>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index ef974df..47144c7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -993,7 +993,7 @@ def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class.
   let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 0, 8>";
 }
 def PPR_p8to15 : PPRClass<8, 15> {
-  let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PNRRegClassID, 8, 8>";
+  let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 8, 8>";
 }
 
 def PPRMul2 : PPRClass<0, 14, 2>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ff2d2f..d930a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
     return false;
 
+  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+                                  this]() -> bool {
+    if (CmpValue != 0)
+      return false;
+
+    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+    if (!Def || Def->getParent() != CmpInstr.getParent())
+      return false;
+
+    // S_CSELECT_B32/B64 choosing between a non-zero immediate and zero.
+    const auto foldableSelect = [](MachineInstr *Def) -> bool {
+      const unsigned Opc = Def->getOpcode();
+      if (Opc != AMDGPU::S_CSELECT_B32 && Opc != AMDGPU::S_CSELECT_B64)
+        return false;
+      const MachineOperand &TrueVal = Def->getOperand(1);
+      const MachineOperand &FalseVal = Def->getOperand(2);
+      return TrueVal.isImm() && TrueVal.getImm() != 0 && FalseVal.isImm() &&
+             FalseVal.getImm() == 0;
+    };
+
+    // For S_OP that set SCC = DST!=0, do the transformation
+    //
+    //   s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+    //
+    // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+    // for S_CSELECT* already has the same value that will be calculated by
+    // s_cmp_lg_*
+    //
+    //   s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+    //   imm), 0)
+    if (!(setsSCCifResultIsNonZero(*Def) || foldableSelect(Def)))
+      return false;
+
+    // Bail if SCC is clobbered before the compare; track the last SCC kill.
+    MachineInstr *KillsSCC = nullptr;
+    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
+         I != E; ++I) {
+      if (I->modifiesRegister(AMDGPU::SCC, &RI))
+        return false;
+      if (I->killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &*I;
+    }
+
+    MachineOperand *SccDef =
+        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+    if (SccDef)
+      SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+    CmpInstr.eraseFromParent();
+    return true;
+  };
+
   const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                                this](int64_t ExpectedValue, unsigned SrcSize,
                                      bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
       return false;
 
-    for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
-         I != E; ++I) {
-      if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
-          I->killsRegister(AMDGPU::SCC, &RI))
+    MachineInstr *KillsSCC = nullptr;
+    for (MachineInstr &MI :
+         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
         return false;
+      if (MI.killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &MI;
     }
 
     MachineOperand *SccDef =
         Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
     SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
     CmpInstr.eraseFromParent();
 
     if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   case AMDGPU::S_CMP_LG_I32:
   case AMDGPU::S_CMPK_LG_U32:
   case AMDGPU::S_CMPK_LG_I32:
-    return optimizeCmpAnd(0, 32, true, false);
+    return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
   case AMDGPU::S_CMP_GT_U32:
   case AMDGPU::S_CMPK_GT_U32:
     return optimizeCmpAnd(0, 32, false, false);
@@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   case AMDGPU::S_CMPK_GT_I32:
     return optimizeCmpAnd(0, 32, false, true);
   case AMDGPU::S_CMP_LG_U64:
-    return optimizeCmpAnd(0, 64, true, false);
+    return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
   }
 
   return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..5fdedda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,52 @@ public:
     }
   }
 
+  /// True if \p MI is one of the scalar ops that set SCC = (result != 0).
+  static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::S_ABSDIFF_I32:
+    case AMDGPU::S_ABS_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_ANDN2_B32:
+    case AMDGPU::S_ANDN2_B64:
+    case AMDGPU::S_ASHR_I32:
+    case AMDGPU::S_ASHR_I64:
+    case AMDGPU::S_BCNT0_I32_B32:
+    case AMDGPU::S_BCNT0_I32_B64:
+    case AMDGPU::S_BCNT1_I32_B32:
+    case AMDGPU::S_BCNT1_I32_B64:
+    case AMDGPU::S_BFE_I32:
+    case AMDGPU::S_BFE_I64:
+    case AMDGPU::S_BFE_U32:
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_LSHL_B32:
+    case AMDGPU::S_LSHL_B64:
+    case AMDGPU::S_LSHR_B32:
+    case AMDGPU::S_LSHR_B64:
+    case AMDGPU::S_NAND_B32:
+    case AMDGPU::S_NAND_B64:
+    case AMDGPU::S_NOR_B32:
+    case AMDGPU::S_NOR_B64:
+    case AMDGPU::S_NOT_B32:
+    case AMDGPU::S_NOT_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_ORN2_B32:
+    case AMDGPU::S_ORN2_B64:
+    case AMDGPU::S_QUADMASK_B32:
+    case AMDGPU::S_QUADMASK_B64:
+    case AMDGPU::S_WQM_B32:
+    case AMDGPU::S_WQM_B64:
+    case AMDGPU::S_XNOR_B32:
+    case AMDGPU::S_XNOR_B64:
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
+      return true;
+    }
+    return false;
+  }
+
   static bool isEXP(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::EXP;
   }
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 1f773e2..3368a50 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -820,7 +820,7 @@ void ARMAsmPrinter::emitAttributes() {
 
     auto *BTIValue = mdconst::extract_or_null<ConstantInt>(
         SourceModule->getModuleFlag("branch-target-enforcement"));
-    if (BTIValue && BTIValue->isOne()) {
+    if (BTIValue && !BTIValue->isZero()) {
       // If "+pacbti" is used as an architecture extension,
       // Tag_BTI_extension is emitted in
       // ARMTargetStreamer::emitTargetAttributes().
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 406f4c1..597d311 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1036,6 +1036,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
   while (!Worklist.empty()) {
     MachineInstr *MI = Worklist.pop_back_val();
     if (MI->getOpcode() == ARM::MQPRCopy) {
+      LLVM_DEBUG(dbgs() << " Must generate copy as VMOV: " << *MI);
       VMOVCopies.insert(MI);
       MachineInstr *CopySrc =
           RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
@@ -1045,6 +1046,20 @@ bool LowOverheadLoop::ValidateLiveOuts() {
       LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
       VMOVCopies.clear();
       return false;
+    } else if (isVectorPredicated(MI)) {
+      // If this is a predicated instruction with merging semantics,
+      // check where it gets its false lanes from, if any.
+      int InactiveIdx = findVPTInactiveOperandIdx(*MI);
+      if (InactiveIdx != -1) {
+        // Queue the unique reaching def of the inactive-lanes operand, if any.
+        MachineInstr *FalseSrc = RDI.getUniqueReachingMIDef(
+            MI, MI->getOperand(InactiveIdx).getReg());
+        if (FalseSrc) {
+          LLVM_DEBUG(dbgs()
+                     << " Must check source of false lanes for: " << *MI);
+          Worklist.push_back(FalseSrc);
+        }
+      }
     }
   }
 
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 431ce38..f5653d4 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
   return -1;
 }
 
+int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) {
+  // Find the first vpred_r operand; the inactive-lanes input lives at the
+  // ARM::SUBOP_vpred_r_inactive offset from it. -1 means there is none.
+  const MCInstrDesc &Desc = MI.getDesc();
+  for (unsigned Idx = 0, NumOps = Desc.getNumOperands(); Idx != NumOps; ++Idx)
+    if (Desc.operands()[Idx].OperandType == ARM::OPERAND_VPRED_R)
+      return Idx + ARM::SUBOP_vpred_r_inactive;
+  return -1;
+}
+
 ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
                                             Register &PredReg) {
   int PIdx = findFirstVPTPredOperandIdx(MI);
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 3ec3a621..1b0bf2d 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) {
   Register PredReg;
   return getVPTInstrPredicate(MI, PredReg);
 }
+// Identify the input operand in an MVE predicated instruction which
+// contributes the values of any inactive vector lanes.
+int findVPTInactiveOperandIdx(const MachineInstr &MI);
 
 // Recomputes the Block Mask of Instr, a VPT or VPST instruction.
 // This rebuilds the block mask of the instruction depending on the predicates
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 9b5fc9d..a652b7e 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -95,7 +95,24 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
     return;
   IsCompleted = true;
 
-  BTFType.NameOff = BDebug.addString(Name);
+  switch (Kind) {
+  case BTF::BTF_KIND_PTR:
+  case BTF::BTF_KIND_CONST:
+  case BTF::BTF_KIND_VOLATILE:
+  case BTF::BTF_KIND_RESTRICT:
+    // Debug info might contain names for these types, but given that we want
+    // to keep BTF minimal and naming reference types doesn't bring any value
+    // (what matters is the completeness of the base type), we don't emit them.
+    //
+    // Furthermore, the Linux kernel refuses to load BPF programs that contain
+    // BTF with these types named:
+    // https://elixir.bootlin.com/linux/v6.17.1/source/kernel/bpf/btf.c#L2586
+    BTFType.NameOff = 0;
+    break;
+  default:
+    BTFType.NameOff = BDebug.addString(Name);
+    break;
+  }
 
   if (NeedsFixup || !DTy)
     return;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d477522..17f04d0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14736,8 +14736,8 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
 }
 
 unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
-  // Note: This functionality is used only when unsafe-fp-math is enabled, and
-  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+  // Note: This functionality is used only when arcp is enabled, and
+  // on cores with reciprocal estimates (which are used when arcp is
   // enabled for division), this functionality is redundant with the default
   // combiner logic (once the division -> reciprocal/multiply transformation
   // has taken place). As a result, this matters more for older cores than for
diff --git a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
index f7fb886..3ca0b40 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ReplaceConstant.h"
 
 #define DEBUG_TYPE "spirv-cbuffer-access"
 using namespace llvm;
@@ -57,6 +58,12 @@ static bool replaceCBufferAccesses(Module &M) {
   if (!CBufMD)
     return false;
 
+  SmallVector<Constant *> CBufferGlobals;
+  for (const hlsl::CBufferMapping &Mapping : *CBufMD)
+    for (const hlsl::CBufferMember &Member : Mapping.Members)
+      CBufferGlobals.push_back(Member.GV);
+  convertUsersOfConstantsToInstructions(CBufferGlobals);
+
   for (const hlsl::CBufferMapping &Mapping : *CBufMD) {
     Instruction *HandleDef = findHandleDef(Mapping.Handle);
     if (!HandleDef) {
@@ -80,12 +87,7 @@ static bool replaceCBufferAccesses(Module &M) {
       Value *GetPointerCall = Builder.CreateIntrinsic(
           PtrType, Intrinsic::spv_resource_getpointer, {HandleDef, IndexVal});
 
-      // We cannot use replaceAllUsesWith here because some uses may be
-      // ConstantExprs, which cannot be replaced with non-constants.
-      SmallVector<User *, 4> Users(MemberGV->users());
-      for (User *U : Users) {
-        U->replaceUsesOfWith(MemberGV, GetPointerCall);
-      }
+      MemberGV->replaceAllUsesWith(GetPointerCall);
     }
   }
 
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index cf85691..9bda8a4 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -158,7 +158,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
     Options.X = F.getFnAttribute(Y).getValueAsBool();     \
   } while (0)
 
-  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
   RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
   RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
   RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f973949..7ec463b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -183,6 +183,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       for (auto T : {MVT::i32, MVT::i64})
         setOperationAction(Op, T, Custom);
 
+  if (Subtarget->hasRelaxedSIMD()) {
+    setOperationAction(
+        {ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM},
+        {MVT::v4f32, MVT::v2f64}, Legal);
+  }
   // SIMD-specific configuration
   if (Subtarget->hasSIMD128()) {
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 7840620..f0ac26b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1742,6 +1742,23 @@ defm SIMD_RELAXED_FMIN :
 defm SIMD_RELAXED_FMAX :
    RelaxedBinary<F64x2, int_wasm_relaxed_max, "relaxed_max", 0x110>;
 
+let Predicates = [HasRelaxedSIMD] in {
+  foreach vec = [F32x4, F64x2] in {
+    defvar relaxed_min = !cast<NI>("SIMD_RELAXED_FMIN_"#vec);
+    defvar relaxed_max = !cast<NI>("SIMD_RELAXED_FMAX_"#vec);
+
+    // Select fminnum/fminimumnum and fmaxnum/fmaximumnum as relaxed min/max
+    def : Pat<(vec.vt (fminnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_min V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fminimumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_min V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fmaxnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_max V128:$lhs, V128:$rhs)>;
+    def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+              (relaxed_max V128:$lhs, V128:$rhs)>;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Relaxed rounding q15 multiplication
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee5..d49f25a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20558,7 +20558,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
 
   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
   // NOTE: By using fsub of a positive constant instead of fadd of a negative
-  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+  // constant, we avoid reassociation in MachineCombiner when reassoc is
   // enabled. See PR24512.
   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   // TODO: Are there any fast-math-flags to propagate here?
@@ -29516,11 +29516,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
   if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
     return MinMax;
 
-  if (DAG.isKnownNeverNaN(NewX))
-    NewX = NewY;
-
-  SDValue IsNaN =
-      DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+  SDValue NaNSrc = IsNum ? MinMax : NewX;
+  SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
 
   return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
 }
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83bd6ac..1b748b7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
 defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
                                SchedWriteFCmpSizes, 0>;
 
-// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// MIN/MAX nodes are commutable under (nnan + ninf). In this case we use
 // X86fminc and X86fmaxc instead of X86fmin and X86fmax
 multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
                                     X86VectorVTInfo _, SDNode OpNode,
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc30054..ac4d31d 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 def Znver4Model : SchedMachineModel {
-  // AMD SOG Zen4, 2.9.6 Dispatch
+  // AMD SOG Zen4, 2.9.8 Dispatch
   // The processor may dispatch up to 6 macro ops per cycle
   // into the execution engine.
   let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
   int VecLoadLatency = 7;
   // Latency of a simple store operation.
   int StoreLatency = 1;
-  // FIXME:
-  let HighLatency = 25; // FIXME: any better choice?
+  // Mean and median value for all instructions with latencies >6
+  // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+  let HighLatency = 13;
   // AMD SOG Zen4, 2.8 Optimizing Branching
   // The branch misprediction penalty is in the range from 11 to 18 cycles,
   // <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
 
 def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
 
+// values from uops.info
 def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 2; // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
 
 def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
   let Latency = 3; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [24];
-  let NumMicroOps = 19;
+  let ReleaseAtCycles = [20];
+  let NumMicroOps = 15;
 }
 def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
 
 def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 4; // FIXME: not from llvm-exegesis
-  let ReleaseAtCycles = [59];
-  let NumMicroOps = 28;
+  let Latency = 2; // FIXME: not from llvm-exegesis
+  let ReleaseAtCycles = [40];
+  let NumMicroOps = 26;
 }
 def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
 
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
 def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
   let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
   let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = 5;
+  let NumMicroOps = 2;
 }
 def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
 
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
 def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
 
 // Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
-
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
 
 defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
 
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
 }
 def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
 
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
 
 def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
-  let Latency = 2;
-  let ReleaseAtCycles = [4];
-  let NumMicroOps = 2;
+  let Latency = 1;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
 
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
 }
 def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
 
-def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
-  // TODO: All align instructions are expected to be of 4 cycle latency
-  let Latency = 4;
+// 128-bit VALIGN
+def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 2;
   let ReleaseAtCycles = [1];
   let NumMicroOps = 1;
 }
-def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
-                                            VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
-                                            >;
+
+// 256-bit VALIGN
+def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 3;
+  let ReleaseAtCycles = [1];
+  let NumMicroOps = 1;
+}
+
+// 512-bit VALIGN
+def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [2];
+  let NumMicroOps = 1;
+}
+
+def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>; // 128-bit forms
+def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>; // 256-bit forms
+def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;       // 512-bit forms
+
 defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
 
 def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
 
 // Strings instructions.
 // Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
 // Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
 // Packed Compare Implicit Length Strings, Return Index
 defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
 // Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
 defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
 
 // Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
 
 // EMMS/FEMMS
 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
 def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
 
 def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 7;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
 
 def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
 
 def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 6;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
 
 def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
 
 def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
-  let Latency = 5;
+  let Latency = 4;
   let ReleaseAtCycles = [1];
-  let NumMicroOps = 2;
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
 
 def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
   let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
-  let ReleaseAtCycles = [1, 1, 2];
-  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+  let ReleaseAtCycles = [1, 1, 1];
+  let NumMicroOps = 1;
 }
 def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
 
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index bbbac45..7a95df4 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -907,10 +907,20 @@ static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
   StoreInst *Store = Builder.CreateAlignedStore(
       Val, First.Store->getPointerOperand(), First.Store->getAlign());
 
+  // Merge various metadata onto the new store.
   AAMDNodes AATags = First.Store->getAAMetadata();
-  for (const PartStore &Part : drop_begin(Parts))
+  SmallVector<Instruction *> Stores = {First.Store};
+  Stores.reserve(Parts.size());
+  SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
+  DbgLocs.reserve(Parts.size());
+  for (const PartStore &Part : drop_begin(Parts)) {
     AATags = AATags.concat(Part.Store->getAAMetadata());
+    Stores.push_back(Part.Store);
+    DbgLocs.push_back(Part.Store->getDebugLoc());
+  }
   Store->setAAMetadata(AATags);
+  Store->mergeDIAssignID(Stores);
+  Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs));
 
   // Remove the old stores.
   for (const PartStore &Part : Parts)
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 46fb567..aa1346d 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1271,7 +1271,7 @@ bool LowerTypeTestsModule::hasBranchTargetEnforcement() {
     // the module flags.
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
           M.getModuleFlag("branch-target-enforcement")))
-      HasBranchTargetEnforcement = (BTE->getZExtValue() != 0);
+      HasBranchTargetEnforcement = !BTE->isZero();
     else
       HasBranchTargetEnforcement = 0;
   }
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 76e588b..a0f7ec6 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -24,7 +24,8 @@
 //   returns 0, or a single vtable's function returns 1, replace each virtual
 //   call with a comparison of the vptr against that vtable's address.
 //
-// This pass is intended to be used during the regular and thin LTO pipelines:
+// This pass is intended to be used during the regular/thin and non-LTO
+// pipelines:
 //
 // During regular LTO, the pass determines the best optimization for each
 // virtual call and applies the resolutions directly to virtual calls that are
@@ -48,6 +49,14 @@
 //   is supported.
 // - Import phase: (same as with hybrid case above).
 //
+// During speculative devirtualization mode (not restricted to LTO):
+// - The pass applies speculative devirtualization without requiring any type of
+//   visibility.
+// - Skips other features like virtual constant propagation, uniform return
+//   value optimization, unique return value optimization and branch funnels as
+//   they need LTO.
+// - This mode is enabled via 'devirtualize-speculatively' flag.
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
@@ -61,7 +70,9 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
@@ -145,6 +156,13 @@ static cl::opt<std::string> ClWriteSummary(
              "bitcode, otherwise YAML"),
     cl::Hidden);
 
+// TODO: This option should eventually support public-visibility vtables both
+// with and without LTO.
+static cl::opt<bool> ClDevirtualizeSpeculatively(
+    "devirtualize-speculatively",
+    cl::desc("Enable speculative devirtualization optimization"),
+    cl::init(false));
+
 static cl::opt<unsigned>
     ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
                 cl::init(10),
@@ -892,6 +910,8 @@ void llvm::updatePublicTypeTestCalls(Module &M,
       CI->eraseFromParent();
     }
   } else {
+    // TODO: Don't replace public type tests when speculative devirtualization
+    // gets enabled in LTO mode.
     auto *True = ConstantInt::getTrue(M.getContext());
     for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) {
       auto *CI = cast<CallInst>(U.getUser());
@@ -1083,10 +1103,10 @@ bool DevirtModule::tryFindVirtualCallTargets(
     if (!TM.Bits->GV->isConstant())
       return false;
 
-    // We cannot perform whole program devirtualization analysis on a vtable
-    // with public LTO visibility.
-    if (TM.Bits->GV->getVCallVisibility() ==
-        GlobalObject::VCallVisibilityPublic)
+    // Without ClDevirtualizeSpeculatively, we cannot perform whole program
+    // devirtualization analysis on a vtable with public LTO visibility.
+    if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() ==
+                                            GlobalObject::VCallVisibilityPublic)
       return false;
 
     Function *Fn = nullptr;
@@ -1105,6 +1125,12 @@ bool DevirtModule::tryFindVirtualCallTargets(
     if (Fn->getName() == "__cxa_pure_virtual")
       continue;
 
+    // In most cases empty functions will be overridden by the
+    // implementation of the derived class, so we can skip them.
+    if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() &&
+        Fn->getInstructionCount() <= 1)
+      continue;
+
     // We can disregard unreachable functions as possible call targets, as
     // unreachable functions shouldn't be called.
     if (mustBeUnreachableFunction(Fn, ExportSummary))
@@ -1223,10 +1249,12 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
         CallTrap->setDebugLoc(CB.getDebugLoc());
       }
 
-      // If fallback checking is enabled, add support to compare the virtual
-      // function pointer to the devirtualized target. In case of a mismatch,
-      // fall back to indirect call.
-      if (DevirtCheckMode == WPDCheckMode::Fallback) {
+      // If fallback checking or speculative devirtualization are enabled,
+      // add support to compare the virtual function pointer to the
+      // devirtualized target. In case of a mismatch, fall back to indirect
+      // call.
+      if (DevirtCheckMode == WPDCheckMode::Fallback ||
+          ClDevirtualizeSpeculatively) {
         MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights();
         // Version the indirect call site. If the called value is equal to the
         // given callee, 'NewInst' will be executed, otherwise the original call
@@ -2057,15 +2085,15 @@ void DevirtModule::scanTypeTestUsers(
     Function *TypeTestFunc,
     DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
   // Find all virtual calls via a virtual table pointer %p under an assumption
-  // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
-  // points to a member of the type identifier %md. Group calls by (type ID,
-  // offset) pair (effectively the identity of the virtual function) and store
-  // to CallSlots.
+  // of the form llvm.assume(llvm.type.test(%p, %md)) or
+  // llvm.assume(llvm.public.type.test(%p, %md)).
+  // This indicates that %p points to a member of the type identifier %md.
+  // Group calls by (type ID, offset) pair (effectively the identity of the
+  // virtual function) and store to CallSlots.
   for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
     auto *CI = dyn_cast<CallInst>(U.getUser());
     if (!CI)
       continue;
-
     // Search for virtual calls based on %p and add them to DevirtCalls.
     SmallVector<DevirtCallSite, 1> DevirtCalls;
     SmallVector<CallInst *, 1> Assumes;
@@ -2348,6 +2376,12 @@ bool DevirtModule::run() {
       (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
     return false;
 
+  Function *PublicTypeTestFunc = nullptr;
+  // If we are in speculative devirtualization mode, we can work on the public
+  // type test intrinsics.
+  if (ClDevirtualizeSpeculatively)
+    PublicTypeTestFunc =
+        Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
   Function *TypeTestFunc =
       Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
   Function *TypeCheckedLoadFunc =
@@ -2361,8 +2395,9 @@ bool DevirtModule::run() {
   // module, this pass has nothing to do. But if we are exporting, we also need
   // to handle any users that appear only in the function summaries.
   if (!ExportSummary &&
-      (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
-       AssumeFunc->use_empty()) &&
+      (((!PublicTypeTestFunc || PublicTypeTestFunc->use_empty()) &&
+        (!TypeTestFunc || TypeTestFunc->use_empty())) ||
+       !AssumeFunc || AssumeFunc->use_empty()) &&
       (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) &&
       (!TypeCheckedLoadRelativeFunc ||
        TypeCheckedLoadRelativeFunc->use_empty()))
@@ -2373,6 +2408,9 @@ bool DevirtModule::run() {
   DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
   buildTypeIdentifierMap(Bits, TypeIdMap);
 
+  if (PublicTypeTestFunc && AssumeFunc)
+    scanTypeTestUsers(PublicTypeTestFunc, TypeIdMap);
+
   if (TypeTestFunc && AssumeFunc)
     scanTypeTestUsers(TypeTestFunc, TypeIdMap);
 
@@ -2472,8 +2510,12 @@ bool DevirtModule::run() {
                  .WPDRes[S.first.ByteOffset];
     if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
                                   S.first.ByteOffset, ExportSummary)) {
-
-      if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
+      bool SingleImplDevirt =
+          trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res);
+      // Outside of speculative devirtualization mode, try to apply virtual
+      // constant propagation or branch funneling.
+      // TODO: This should eventually be enabled for non-public type tests.
+      if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) {
         DidVirtualConstProp |=
             tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
 
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
index 29968b8..8181e4e 100644
--- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/AllocToken.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
@@ -54,47 +55,14 @@
 #include <variant>
 
 using namespace llvm;
+using TokenMode = AllocTokenMode;
 
 #define DEBUG_TYPE "alloc-token"
 
 namespace {
 
-//===--- Constants --------------------------------------------------------===//
-
-enum class TokenMode : unsigned {
-  /// Incrementally increasing token ID.
-  Increment = 0,
-
-  /// Simple mode that returns a statically-assigned random token ID.
-  Random = 1,
-
-  /// Token ID based on allocated type hash.
-  TypeHash = 2,
-
-  /// Token ID based on allocated type hash, where the top half ID-space is
-  /// reserved for types that contain pointers and the bottom half for types
-  /// that do not contain pointers.
-  TypeHashPointerSplit = 3,
-};
-
 //===--- Command-line options ---------------------------------------------===//
 
-cl::opt<TokenMode> ClMode(
-    "alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"),
-    cl::init(TokenMode::TypeHashPointerSplit),
-    cl::values(
-        clEnumValN(TokenMode::Increment, "increment",
-                   "Incrementally increasing token ID"),
-        clEnumValN(TokenMode::Random, "random",
-                   "Statically-assigned random token ID"),
-        clEnumValN(TokenMode::TypeHash, "typehash",
-                   "Token ID based on allocated type hash"),
-        clEnumValN(
-            TokenMode::TypeHashPointerSplit, "typehashpointersplit",
-            "Token ID based on allocated type hash, where the top half "
-            "ID-space is reserved for types that contain pointers and the "
-            "bottom half for types that do not contain pointers. ")));
-
 cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
                                   cl::desc("The allocation function prefix"),
                                   cl::Hidden, cl::init("__alloc_token_"));
@@ -217,22 +185,19 @@ public:
   using ModeBase::ModeBase;
 
   uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
-    const auto [N, H] = getHash(CB, ORE);
-    return N ? boundedToken(H) : H;
-  }
 
-protected:
-  std::pair<MDNode *, uint64_t> getHash(const CallBase &CB,
-                                        OptimizationRemarkEmitter &ORE) {
     if (MDNode *N = getAllocTokenMetadata(CB)) {
       MDString *S = cast<MDString>(N->getOperand(0));
-      return {N, getStableSipHash(S->getString())};
+      AllocTokenMetadata Metadata{S->getString(), containsPointer(N)};
+      if (auto Token = getAllocToken(TokenMode::TypeHash, Metadata, MaxTokens))
+        return *Token;
     }
     // Fallback.
     remarkNoMetadata(CB, ORE);
-    return {nullptr, ClFallbackToken};
+    return ClFallbackToken;
   }
 
+protected:
   /// Remark that there was no precise type information.
   static void remarkNoMetadata(const CallBase &CB,
                                OptimizationRemarkEmitter &ORE) {
@@ -253,20 +218,18 @@ public:
   using TypeHashMode::TypeHashMode;
 
   uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
-    if (MaxTokens == 1)
-      return 0;
-    const uint64_t HalfTokens = MaxTokens / 2;
-    const auto [N, H] = getHash(CB, ORE);
-    if (!N) {
-      // Pick the fallback token (ClFallbackToken), which by default is 0,
-      // meaning it'll fall into the pointer-less bucket. Override by setting
-      // -alloc-token-fallback if that is the wrong choice.
-      return H;
+    if (MDNode *N = getAllocTokenMetadata(CB)) {
+      MDString *S = cast<MDString>(N->getOperand(0));
+      AllocTokenMetadata Metadata{S->getString(), containsPointer(N)};
+      if (auto Token = getAllocToken(TokenMode::TypeHashPointerSplit, Metadata,
+                                     MaxTokens))
+        return *Token;
     }
-    uint64_t Hash = H % HalfTokens; // base hash
-    if (containsPointer(N))
-      Hash += HalfTokens;
-    return Hash;
+    // Pick the fallback token (ClFallbackToken), which by default is 0, meaning
+    // it'll fall into the pointer-less bucket. Override by setting
+    // -alloc-token-fallback if that is the wrong choice.
+    remarkNoMetadata(CB, ORE);
+    return ClFallbackToken;
   }
 };
 
@@ -286,7 +249,7 @@ public:
       : Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
         FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
         Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) {
-    switch (ClMode.getValue()) {
+    switch (Options.Mode) {
     case TokenMode::Increment:
       break;
     case TokenMode::Random:
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index d18c0d0..80e77e09 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -2020,7 +2020,6 @@ static void moveFastMathFlags(Function &F,
     F.removeFnAttr(attr);                                                      \
     FMF.set##setter();                                                         \
   }
-  MOVE_FLAG("unsafe-fp-math", Fast)
   MOVE_FLAG("no-infs-fp-math", NoInfs)
   MOVE_FLAG("no-nans-fp-math", NoNaNs)
   MOVE_FLAG("no-signed-zeros-fp-math", NoSignedZeros)
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 8714741a..9829d4d 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1793,3 +1793,13 @@ bool llvm::hasOnlySimpleTerminator(const Function &F) {
   }
   return true;
 }
+
+/// Wrap \p BB in a Printable: emits "<nullptr>" for null, else BB as operand.
+Printable llvm::printBasicBlock(const BasicBlock *BB) {
+  return Printable([BB](raw_ostream &Out) {
+    if (BB)
+      BB->printAsOperand(Out);
+    else
+      Out << "<nullptr>";
+  });
+}
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 978d5a2..371d9e6 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -260,9 +260,16 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
   // next to the defs they must go with so that we can know it's time to pop
   // the stack when we hit the end of the phi uses for a given def.
   const ValueDFS &Top = *Stack.back().V;
-  if (Top.LocalNum == LN_Last && Top.PInfo) {
-    if (!VDUse.U)
-      return false;
+  assert(Top.PInfo && "RenameStack should only contain predicate infos (defs)");
+  if (Top.LocalNum == LN_Last) {
+    if (!VDUse.U) {
+      assert(VDUse.PInfo && "A non-use VDUse should have a predicate info");
+      // We should reserve adjacent LN_Last defs for the same phi use.
+      return VDUse.LocalNum == LN_Last &&
+             // If the two phi defs have the same edge, they must be designated
+             // for the same succ BB.
+             getBlockEdge(Top.PInfo) == getBlockEdge(VDUse.PInfo);
+    }
     auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
     if (!PHI)
       return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index adf27be..d2c100c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9860,6 +9860,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
+  if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
+    UserIC = 1;
 
   // Plan how to best vectorize.
   LVP.plan(UserVF, UserIC);
@@ -9924,7 +9926,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     VectorizeLoop = false;
   }
 
-  if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
+  if (UserIC == 1 && Hints.getInterleave() > 1) {
+    assert(!LVL.isSafeForAnyVectorWidth() &&
+           "UserIC should only be ignored due to unsafe dependencies");
+    LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
+    IntDiagMsg = {"InterleavingUnsafe",
+                  "Ignoring user-specified interleave count due to possibly "
+                  "unsafe dependencies in the loop."};
+    InterleaveLoop = false;
+  } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
     // Tell the user interleaving was avoided up-front, despite being explicitly
     // requested.
     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ff25ef5..48cf763 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4051,7 +4051,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
 static std::optional<ElementCount> isConsecutiveInterleaveGroup(
     VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
     VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
-  if (!InterleaveR)
+  if (!InterleaveR || InterleaveR->getMask())
     return std::nullopt;
 
   Type *GroupElementTy = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 0678bc90..83e3fca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -41,10 +41,10 @@ class VPRecipeBase;
 class VPInterleaveBase;
 class VPPhiAccessors;
 
-// This is the base class of the VPlan Def/Use graph, used for modeling the data
-// flow into, within and out of the VPlan. VPValues can stand for live-ins
-// coming from the input IR and instructions which VPlan will generate if
-// executed.
+/// This is the base class of the VPlan Def/Use graph, used for modeling the
+/// data flow into, within and out of the VPlan. VPValues can stand for live-ins
+/// coming from the input IR and instructions which VPlan will generate if
+/// executed.
 class LLVM_ABI_FOR_TEST VPValue {
   friend class VPDef;
   friend struct VPDoubleValueDef;
@@ -57,7 +57,7 @@ class LLVM_ABI_FOR_TEST VPValue {
   SmallVector<VPUser *, 1> Users;
 
 protected:
-  // Hold the underlying Value, if any, attached to this VPValue.
+  /// Hold the underlying Value, if any, attached to this VPValue.
   Value *UnderlyingVal;
 
   /// Pointer to the VPDef that defines this VPValue. If it is nullptr, the
diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
new file mode 100644
index 0000000..1de8ab5
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; BasicAA should prove that loads from sufficiently large static offsets
+; don't overlap with matrix loads with a statically known size.
+
+define <8 x double> @non_overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: non_overlapping_strided_load:
+; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 12
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+  ret <8 x double> %l
+}
+
+define <8 x double> @overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: overlapping_strided_load:
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+;
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 11
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+  ret <8 x double> %l
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
new file mode 100644
index 0000000..458bd2e
--- /dev/null
+++ b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
@@ -0,0 +1,49 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/masked-store.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE
+; RUN: not llvm-as < %t/masked-store-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE-ZERO
+; RUN: not llvm-as < %t/masked-load.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD
+; RUN: not llvm-as < %t/masked-load-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD-ZERO
+; RUN: not llvm-as < %t/masked-scatter.ll 2>&1 | FileCheck %s --check-prefix=MASKED-SCATTER
+; RUN: not llvm-as < %t/masked-gather.ll 2>&1 | FileCheck %s --check-prefix=MASKED-GATHER
+
+;--- masked-store.ll
+; MASKED-STORE: LLVM ERROR: Invalid alignment argument
+define void @masked_store(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-store-zero.ll
+; MASKED-STORE-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_store_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 0, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-load.ll
+; MASKED-LOAD: LLVM ERROR: Invalid alignment argument
+define void @masked_load(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
+
+;--- masked-load-zero.ll
+; MASKED-LOAD-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_load_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 0, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
+
+;--- masked-scatter.ll
+; MASKED-SCATTER: LLVM ERROR: Invalid alignment argument
+define void @masked_scatter(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call void @llvm.masked.scatter.v2f64.p0(<2 x double> %val, <2 x ptr> %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}
+
+;--- masked-gather.ll
+; MASKED-GATHER: LLVM ERROR: Invalid alignment argument
+define void @masked_gather(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+  call <2 x double> @llvm.masked.gather.v2f64.p0(<2 x ptr> %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+  ret void
+}
diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
deleted file mode 100644
index 00c0131..0000000
--- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: opt -module-summary %s -o %t.o
-
-; Ensure dead stripping performed flag is set on distributed index
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN:		-r %t.o,glob,plx
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD
-; WITHDEAD: <FLAGS op0=97/>
-
-; Ensure dead stripping performed flag is not set on distributed index
-; when option used to disable dead stripping computation.
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN:		-r %t.o,glob,plx -compute-dead=false
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
-; NODEAD: <FLAGS op0=96/>
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@glob = global i32 0
diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll
new file mode 100644
index 0000000..e957ce6
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-index-flags.ll
@@ -0,0 +1,39 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+;; By default, the indexing step should perform dead stripping, attribute
+;; propagation, DSO local propagation, and internalization/promotion, and
+;; set the corresponding index flags.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:		-r %t.o,glob,plx
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL
+;; The flag value should be 0x461 aka 1121:
+;; 0x1: Dead stripping
+;; 0x20: Attribute propagation
+;; 0x40: DSO local propagation
+;; 0x400: Internalization/promotion
+; ALL: <FLAGS op0=1121/>
+
+;; Ensure dead stripping performed flag is not set on distributed index
+;; when option used to disable dead stripping computation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:		-r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
+;; Flag should be 0x460 aka 1120.
+; NODEAD: <FLAGS op0=1120/>
+
+;; Disabling attribute propagation should clear both the attribute propagation
+;; flag and the DSO local propagation flag.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN:		-r %t.o,glob,plx -propagate-attrs=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP
+;; Flag should be 0x401 aka 1025.
+; NOPROP: <FLAGS op0=1025/>
+
+;; Note there isn't currently a way to disable internalization+promotion, which
+;; are performed together.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
diff --git a/llvm/test/Bitcode/upgrade-branch-protection.ll b/llvm/test/Bitcode/upgrade-branch-protection.ll
index 1b33e39..6f60ba5 100644
--- a/llvm/test/Bitcode/upgrade-branch-protection.ll
+++ b/llvm/test/Bitcode/upgrade-branch-protection.ll
@@ -1,8 +1,11 @@
-;; Test that module flags "branch-target-enforcement" and "sign-return-address"  can be upgraded to
-;; are upgraded from Error to Min.
+;; Test that module flags "branch-target-enforcement" and "sign-return-address"
+;; are upgraded from Error to Min and that their values are changed to 2
+;; as the module is converted to the new semantics.
 
 ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s
 
+target triple = "aarch64-unknown-linux-gnu"
+
 !llvm.module.flags = !{!0, !1, !2, !3}
 
 !0 = !{i32 1, !"branch-target-enforcement", i32 1}
@@ -10,7 +13,7 @@
 !2 = !{i32 1, !"sign-return-address-all", i32 1}
 !3 = !{i32 1, !"sign-return-address-with-bkey", i32 1}
 
-;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 1}
-;CHECK: !1 = !{i32 8, !"sign-return-address", i32 1}
-;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 1}
-;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 1}
-\ No newline at end of file
+;CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2}
+;CHECK: !1 = !{i32 8, !"sign-return-address", i32 2}
+;CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2}
+;CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
index 97a0417..b040ff2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
@@ -56,7 +56,7 @@
   }
 
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
   attributes #2 = { optsize }
   attributes #3 = { minsize }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
index fc4fbac..f24aeae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
@@ -47,7 +47,7 @@
     ret void
   }
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index b06cadf..e4d2ca3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -50,7 +50,7 @@
 
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index 0c1776e..6e3682a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -37,7 +37,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index f2ed57e..353e818 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -325,7 +325,7 @@ entry:
 
 declare void @hhh(double, double)
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 7e97116..8da0e11 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -694,8 +694,8 @@ bb1:
 ; CHECK:  .[[LABEL]]:
 ; CHECK:  ret
 
-attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !1 = !{!2, !2, i64 0}
 !2 = !{!"int", !3, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
index 296435a..937bfe4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -519,8 +519,8 @@ while.cond:
   br label %while.cond
 }
 
-attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
index 45fa2be5..c05d661 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
@@ -79,8 +79,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #3
 
-  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+  attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #2 = { nounwind readnone speculatable }
   attributes #3 = { nounwind }
   
diff --git a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
index 4e86f52..071344d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
@@ -47,6 +47,6 @@ declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1
 ; Function Attrs: nounwind readnone
 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1
 
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
index 9b3d539..0ddcdcc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
@@ -8,5 +8,5 @@ define float @mul_add(float %a, float %b, float %c) local_unnamed_addr #0 {
     ret float %add
 }
 
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
index e17a0a9..54f752e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -verify-machineinstrs | FileCheck %s
 
 define void @foo_2d(ptr %src) {
 ; CHECK-LABEL: %entry
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
index d2ce7e6..41f57bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -84,7 +84,7 @@ bb3:                                              ; preds = %bb3, %bb
 ; Function Attrs: nounwind readnone
 declare i64 @llvm.objectsize.i64.p0(ptr, i1) #1
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !1 = !{!2, !2, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 0b22fa4..c2b2c1e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1654,24 +1654,14 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) {
 }
 
 define <8 x i8> @dup_ld1_from_stack(ptr %__ret) {
-; CHECK-SD-LABEL: dup_ld1_from_stack:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sub sp, sp, #16
-; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT:    add x8, sp, #15
-; CHECK-SD-NEXT:    ld1r.8b { v0 }, [x8]
-; CHECK-SD-NEXT:    add sp, sp, #16
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: dup_ld1_from_stack:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT:    .cfi_offset w29, -16
-; CHECK-GI-NEXT:    add x8, sp, #15
-; CHECK-GI-NEXT:    ld1r.8b { v0 }, [x8]
-; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: dup_ld1_from_stack:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    add x8, sp, #15
+; CHECK-NEXT:    ld1r.8b { v0 }, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
 entry:
   %item = alloca i8, align 1
   %0 = load i8, ptr %item, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 4cdc6cc..c6cf240 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -107,7 +107,7 @@ define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Function Attrs: nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
index 82b34ef..bb1a6b0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -108,5 +108,5 @@ for.end:                                          ; preds = %for.cond
 ; Function Attrs: nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
index d487aab..3ce35bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
@@ -201,4 +201,4 @@ entry:
 }
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
index db65fdd..1486b3a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
@@ -36,6 +36,6 @@ for.end705.i:                                     ; preds = %for.body453.i
 
 declare void @f() local_unnamed_addr #1
 
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
index fc59350..593d629 100644
--- a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -18,7 +18,7 @@ entry:
   ret i32 %1
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
 
diff --git a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
index 2e3b99f..c4bf7d2 100644
--- a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
+++ b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
@@ -61,4 +61,4 @@ declare dso_local void @e(...) local_unnamed_addr #0
 
 declare dso_local i64 @llvm.aarch64.space(i32, i64) local_unnamed_addr #0
 
-attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
index 031ee35..7d2aaec 100644
--- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll
+++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
@@ -108,7 +108,7 @@ bb19:                                             ; preds = %bb3, %bb
   ret void
 }
 
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/CodeGen/AArch64/csel-zero-float.ll b/llvm/test/CodeGen/AArch64/csel-zero-float.ll
index 6edde13..56a33cc 100644
--- a/llvm/test/CodeGen/AArch64/csel-zero-float.ll
+++ b/llvm/test/CodeGen/AArch64/csel-zero-float.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -enable-unsafe-fp-math < %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s
 ; There is no invocation to FileCheck as this
 ; caused a crash in "Post-RA pseudo instruction expansion"
 
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 61df396..e561481 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -32,5 +32,5 @@ main_:
 
 declare i32 @printf(ptr, ...) #1
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 1a83930..9193025 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
 ; load zero-extended i32, bitcast to f64
-define double @_Z9load_u64_from_u32_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u32_testPj:
+define double @load_u64_from_u32(ptr %n){
+; CHECK-LABEL: load_u64_from_u32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ret
@@ -15,8 +15,8 @@ entry:
 }
 
 ; load zero-extended i16, bitcast to f64
-define double @_Z9load_u64_from_u16_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u16_testPj:
+define double @load_u64_from_u16(ptr %n){
+; CHECK-LABEL: load_u64_from_u16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ret
@@ -28,8 +28,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f64
-define double @_Z16load_u64_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u64_from_u8Ph:
+define double @load_u64_from_u8(ptr %n){
+; CHECK-LABEL: load_u64_from_u8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ret
@@ -41,8 +41,8 @@ entry:
 }
 
 ; load zero-extended i16, bitcast to f32
-define float @_Z17load_u32_from_u16Pt(ptr %n){
-; CHECK-LABEL: _Z17load_u32_from_u16Pt:
+define float @load_u32_from_u16(ptr %n){
+; CHECK-LABEL: load_u32_from_u16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ret
@@ -54,8 +54,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f32
-define float @_Z16load_u32_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u32_from_u8Ph:
+define float @load_u32_from_u8(ptr %n){
+; CHECK-LABEL: load_u32_from_u8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ret
@@ -67,8 +67,8 @@ entry:
 }
 
 ; load zero-extended i8, bitcast to f16
-define half @_Z16load_u16_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u16_from_u8Ph:
+define half @load_u16_from_u8(ptr %n){
+; CHECK-LABEL: load_u16_from_u8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
@@ -80,3 +80,504 @@ entry:
   ret half %1
 }
 
+; load zero-extended i32 at byte offset 1, bitcast to f64
+define double @load_u64_from_u32_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldur w8, [x0, #1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 1, bitcast to f64
+define double @load_u64_from_u16_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0, #1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 1, bitcast to f64
+define double @load_u64_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #1]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 1, bitcast to f32
+define float @load_u32_from_u16_off1(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 1, bitcast to f32
+define float @load_u32_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 1, bitcast to f16
+define half @load_u16_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 1
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
+
+; load zero-extended i32 at byte offset 2, bitcast to f64
+define double @load_u64_from_u32_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldur w8, [x0, #2]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 2, bitcast to f64
+define double @load_u64_from_u16_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrh w8, [x0, #2]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 2, bitcast to f64
+define double @load_u64_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #2]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 2, bitcast to f32
+define float @load_u32_from_u16_off2(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr h0, [x0, #2]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 2, bitcast to f32. NOTE(review): expected asm reads [x0, #1], not #2 - confirm
+define float @load_u32_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #1]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 2, bitcast to f16. NOTE(review): expected asm reads [x0, #1], not #2 - confirm
+define half @load_u16_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #1]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 2
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
+
+; load zero-extended i32 at byte offset 255, bitcast to f64
+define double @load_u64_from_u32_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldur w8, [x0, #255]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 255, bitcast to f64
+define double @load_u64_from_u16_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0, #255]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 255, bitcast to f64
+define double @load_u64_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #255]
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 255, bitcast to f32
+define float @load_u32_from_u16_off255(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0, #255]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 255, bitcast to f32
+define float @load_u32_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #255]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 255, bitcast to f16
+define half @load_u16_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off255:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, #255]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 255
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
+; load zero-extended i32 at byte offset 256, bitcast to f64
+define double @load_u64_from_u32_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr s0, [x0, #256]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 256, bitcast to f64. NOTE(review): expected asm reads [x0, #128], not #256 - confirm
+define double @load_u64_from_u16_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr h0, [x0, #128]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 256, bitcast to f64. NOTE(review): expected asm reads [x0, #64], not #256 - confirm
+define double @load_u64_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #64]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 256, bitcast to f32
+define float @load_u32_from_u16_off256(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr h0, [x0, #256]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 256, bitcast to f32. NOTE(review): expected asm reads [x0, #128], not #256 - confirm
+define float @load_u32_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #128]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 256, bitcast to f16. NOTE(review): expected asm reads [x0, #128], not #256 - confirm
+define half @load_u16_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #128]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 256
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
+
+; load zero-extended i32 at byte offset 16380, bitcast to f64
+define double @load_u64_from_u32_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr s0, [x0, #16380]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 16380
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 8190, bitcast to f64. NOTE(review): offset goes through mov; load_u32_from_u16_offn folds #8190 - confirm
+define double @load_u64_from_u16_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #8190 // =0x1ffe
+; CHECK-NEXT:    ldr h0, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 8190
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 4095, bitcast to f64
+define double @load_u64_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #4095]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4095
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 8190, bitcast to f32
+define float @load_u32_from_u16_offn(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr h0, [x0, #8190]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 8190
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 4095, bitcast to f32
+define float @load_u32_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #4095]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4095
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 4095, bitcast to f16
+define half @load_u16_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_offn:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #4095]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4095
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
+; load zero-extended i32 at byte offset 16384, bitcast to f64
+define double @load_u64_from_u32_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add x8, x0, #4, lsl #12 // =16384
+; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 16384
+  %0 = load i32, ptr %p, align 4
+  %conv = zext i32 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 8192, bitcast to f64. NOTE(review): expected asm reads [x0, #4096], not #8192 - confirm
+define double @load_u64_from_u16_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr h0, [x0, #4096]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 8192
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i8 at byte offset 4096, bitcast to f64. NOTE(review): expected asm reads [x0, #1024], not #4096 - confirm
+define double @load_u64_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #1024]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4096
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i64
+  %1 = bitcast i64 %conv to double
+  ret double %1
+}
+; load zero-extended i16 at byte offset 8192, bitcast to f32
+define float @load_u32_from_u16_offnp1(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add x8, x0, #2, lsl #12 // =8192
+; CHECK-NEXT:    ldr h0, [x8]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 8192
+  %0 = load i16, ptr %p, align 2
+  %conv = zext i16 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 4096, bitcast to f32. NOTE(review): expected asm reads [x0, #2048], not #4096 - confirm
+define float @load_u32_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #2048]
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4096
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i32
+  %1 = bitcast i32 %conv to float
+  ret float %1
+}
+; load zero-extended i8 at byte offset 4096, bitcast to f16. NOTE(review): expected asm reads [x0, #2048], not #4096 - confirm
+define half @load_u16_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_offnp1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr b0, [x0, #2048]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+entry:
+  %p = getelementptr i8, ptr %n, i64 4096
+  %0 = load i8, ptr %p, align 1
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to half
+  ret half %1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
index c2ef2fa..00a8c30 100644
--- a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
+++ b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
@@ -74,7 +74,7 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !10
 }
 
-attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
index 9f00621..fa1da33 100644
--- a/llvm/test/CodeGen/AArch64/recp-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -164,5 +164,5 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 {
 ; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
 }
 
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "reciprocal-estimates"="div,vec-div" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "reciprocal-estimates"="div,vec-div" }
diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
new file mode 100644
index 0000000..6f33a75
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
@@ -0,0 +1,76 @@
+# RUN: llc -mtriple=aarch64 -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s
+--- |
+  declare double @foo()
+
+  define double @shrink_wrap_load_from_const_pool(double %q) {
+  entry:
+    %0 = fcmp oeq double %q, 3.125500e+02
+    br i1 %0, label %common.ret, label %if.else
+
+  common.ret:                                       ; preds = %if.else, %entry, %exit1
+    %common.ret.op = phi double [ %3, %exit1 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.else ]
+    ret double %common.ret.op
+
+  if.else:                                          ; preds = %entry
+    %1 = call double @foo()
+    %2 = fcmp oeq double %1, 0.000000e+00
+    br i1 %2, label %exit1, label %common.ret
+
+  exit1:                                            ; preds = %if.else
+    %3 = call double @foo()
+    br label %common.ret
+  }
+...
+# The following code has a load from the constant pool. A constant-pool
+# access must not be treated as a stack access, so shrink wrapping must
+# still happen.
+# CHECK-LABEL:name:            shrink_wrap_load_from_const_pool
+# CHECK:  savePoint:
+# CHECK:    - point:           '%bb.3'
+# CHECK:  restorePoint:
+# CHECK:    - point:           '%bb.5'
+---
+name:            shrink_wrap_load_from_const_pool
+tracksRegLiveness: true
+constants:
+  - id:              0
+    value:           'double 3.125500e+02'
+    alignment:       8
+body:             |
+  bb.0.entry:
+    successors: %bb.4(0x50000000), %bb.2(0x30000000)
+    liveins: $d0
+
+    renamable $d1 = COPY $d0
+    renamable $x8 = ADRP target-flags(aarch64-page) %const.0
+    renamable $d2 = LDRDui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) %const.0 :: (load (s64) from constant-pool)
+    renamable $d0 = FMOVD0
+    nofpexcept FCMPDrr killed renamable $d1, killed renamable $d2, implicit-def $nzcv, implicit $fpcr
+    Bcc 1, %bb.2, implicit killed $nzcv
+
+  bb.4:
+    liveins: $d0
+
+  bb.1.common.ret:
+    liveins: $d0
+
+    RET_ReallyLR implicit $d0
+
+  bb.2.if.else:
+    successors: %bb.3(0x50000000), %bb.1(0x30000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $d1 = COPY $d0
+    renamable $d0 = FMOVD0
+    nofpexcept FCMPDri killed renamable $d1, implicit-def $nzcv, implicit $fpcr
+    Bcc 1, %bb.1, implicit killed $nzcv
+    B %bb.3
+
+  bb.3.exit1:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.1
+...
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
index 66ac04e..22abb8c 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
@@ -64,6 +64,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
 ; Function Attrs: argmemonly nounwind willreturn
 declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
 
-attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind willreturn }
 attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
index e5725bc..d689a76 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
@@ -158,10 +158,10 @@ eh.resume:                                        ; preds = %lpad.body
   resume { ptr, i32 } %eh.lpad-body
 }
 
-attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind willreturn }
 attributes #2 = { nounwind readnone }
-attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
 attributes #4 = { nounwind }
 attributes #5 = { noreturn }
 
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
index 91adf82..7483622 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
@@ -77,6 +77,6 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1
 
 declare void @llvm.lifetime.end.p0(ptr nocapture) #1
 
-attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
 attributes #1 = { nounwind }
 
diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
index 523eda61..e41d82c 100644
--- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
@@ -54,7 +54,7 @@ declare void @foo3(ptr)
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
 
-attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
 
 ;--- pic.ll
 !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll
index f78fcea..b8dcd6f 100644
--- a/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll
+++ b/llvm/test/CodeGen/AArch64/svtcf-fmul-fdiv-combine.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple aarch64-none-linux-gnu -enable-unsafe-fp-math -mattr=+fullfp16 < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fullfp16 < %s | FileCheck %s
 
 define half @scvtf_f16_2(i32 %state) {
 ; CHECK-LABEL: scvtf_f16_2:
diff --git a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
index 623ea22..89b3b89 100644
--- a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
+++ b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -24,7 +24,7 @@ define void @fn(ptr %argA, ptr %argB, ptr %a) #0 align 2 {
 
 ; CHECK: ret
 
-attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "use-soft-float"="false" }
 
 !llvm.ident = !{!0}
 
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame5.mir b/llvm/test/CodeGen/AArch64/wineh-frame5.mir
index 97c5c85..32580f4 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame5.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame5.mir
@@ -64,9 +64,9 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #3
 
-  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
-  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #3 = { nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame6.mir b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
index 5ba7842..d76fae1 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame6.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
@@ -47,8 +47,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #2 = { nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame7.mir b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
index 1599098..d4e71d9 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame7.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
@@ -71,8 +71,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #2 = { nounwind }
 
 ...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame8.mir b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
index 9de99ac..56f92f2 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame8.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
@@ -29,7 +29,7 @@
     ret i32 %add
   }
 
-  attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
 
 ...
 ---
diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir
index efdd4b0..1c09b78 100644
--- a/llvm/test/CodeGen/AArch64/wineh5.mir
+++ b/llvm/test/CodeGen/AArch64/wineh5.mir
@@ -73,8 +73,8 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #2
 
-  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #2 = { nounwind }
 
   !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
index 2f631c2..52d0dff 100644
--- a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
+++ b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
@@ -56,9 +56,9 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #3
 
-  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #1 = { argmemonly nounwind }
-  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
   attributes #3 = { nounwind }
 
   !llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index d9ac9a7..de1bb47 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
 
 # Test that we fold correct element from G_UNMERGE_VALUES into fma
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 52b1beb..91f2f6f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX10
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
 
 ---
 name:            fract_f64_neg
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
 ; CHECK-NEXT:    s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB17_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
 ; CHECK-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB17_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
 ; CHECK-LABEL: s_add64_32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 s0, s0, s2
-; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; CHECK-NEXT:    s_addc_u32 s1, s1, s3
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; CHECK-NEXT:    s_addc_u32 s2, s4, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
 define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_uadd_v2i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s2, s6
-; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT:    s_addc_u32 s8, s3, s7
+; CHECK-NEXT:    s_add_u32 s6, s2, s6
+; CHECK-NEXT:    s_addc_u32 s7, s3, s7
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s4
-; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; CHECK-NEXT:    s_addc_u32 s1, s1, s5
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v7
 ; CHECK-NEXT:    v_readfirstlane_b32 s2, v6
-; CHECK-NEXT:    v_mov_b32_e32 v4, s10
-; CHECK-NEXT:    v_mov_b32_e32 v5, s8
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_mov_b32 s3, s2
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
 define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_v2i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_sub_u32 s10, s2, s6
-; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT:    s_subb_u32 s8, s3, s7
+; CHECK-NEXT:    s_sub_u32 s6, s2, s6
+; CHECK-NEXT:    s_subb_u32 s7, s3, s7
 ; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:    s_sub_u32 s0, s0, s4
-; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; CHECK-NEXT:    s_subb_u32 s1, s1, s5
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v7
 ; CHECK-NEXT:    v_readfirstlane_b32 s2, v6
-; CHECK-NEXT:    v_mov_b32_e32 v4, s10
-; CHECK-NEXT:    v_mov_b32_e32 v5, s8
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_mov_b32 s3, s2
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
 ; CHECK-LABEL: s_uadd_i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 s0, s0, s2
-; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; CHECK-NEXT:    s_addc_u32 s1, s1, s3
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_uadd_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 s0, s0, 1
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_sub_u32 s0, s0, 1
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; CHECK-NEXT:    s_subb_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_n1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_sub_u32 s0, s0, -1
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; CHECK-NEXT:    s_subb_u32 s1, s1, -1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_addc_u32 s15, 0, s16
 ; GFX6-NEXT:    s_add_u32 s16, s0, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX6-NEXT:    s_addc_u32 s14, s14, s15
 ; GFX6-NEXT:    s_mul_i32 s0, s12, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_add_u32 s15, s16, s0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX6-NEXT:    s_addc_u32 s14, s14, s12
 ; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX6-NEXT:    s_addc_u32 s4, s4, 0
 ; GFX6-NEXT:    s_mul_i32 s14, s7, s14
-; GFX6-NEXT:    s_add_u32 s14, s1, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    s_add_u32 s16, s1, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT:    s_addc_u32 s15, 0, s4
+; GFX6-NEXT:    s_addc_u32 s17, 0, s4
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_mul_i32 s4, s10, s15
+; GFX6-NEXT:    s_mul_i32 s4, s10, s17
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX6-NEXT:    s_add_i32 s4, s5, s4
-; GFX6-NEXT:    s_mul_i32 s5, s11, s14
-; GFX6-NEXT:    s_add_i32 s16, s4, s5
-; GFX6-NEXT:    s_sub_i32 s17, s7, s16
-; GFX6-NEXT:    s_mul_i32 s4, s10, s14
+; GFX6-NEXT:    s_mul_i32 s5, s11, s16
+; GFX6-NEXT:    s_add_i32 s18, s4, s5
+; GFX6-NEXT:    s_sub_i32 s14, s7, s18
+; GFX6-NEXT:    s_mul_i32 s4, s10, s16
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s18, s4, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_subb_u32 s17, s17, s11
-; GFX6-NEXT:    s_sub_u32 s19, s6, s10
-; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT:    s_or_b32 s15, s4, s5
+; GFX6-NEXT:    s_subb_u32 s19, s14, s11
+; GFX6-NEXT:    s_sub_u32 s20, s6, s10
+; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT:    s_or_b32 s14, s14, s15
+; GFX6-NEXT:    s_subb_u32 s14, s19, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s14, s11
+; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s20, s10
+; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s14, s11
+; GFX6-NEXT:    s_cselect_b32 s14, s19, s15
+; GFX6-NEXT:    s_add_u32 s15, s16, 1
+; GFX6-NEXT:    s_addc_u32 s19, s17, 0
+; GFX6-NEXT:    s_add_u32 s20, s16, 2
+; GFX6-NEXT:    s_addc_u32 s21, s17, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b32 s14, s20, s15
+; GFX6-NEXT:    s_cselect_b32 s15, s21, s19
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_subb_u32 s4, s17, 0
+; GFX6-NEXT:    s_subb_u32 s4, s7, s18
 ; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
 ; GFX6-NEXT:    s_cselect_b32 s5, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s19, s10
-; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s4, s11
-; GFX6-NEXT:    s_cselect_b32 s4, s17, s5
-; GFX6-NEXT:    s_add_u32 s5, s14, 1
-; GFX6-NEXT:    s_addc_u32 s17, s15, 0
-; GFX6-NEXT:    s_add_u32 s19, s14, 2
-; GFX6-NEXT:    s_addc_u32 s20, s15, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s4, s19, s5
-; GFX6-NEXT:    s_cselect_b32 s5, s20, s17
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_subb_u32 s7, s7, s16
-; GFX6-NEXT:    s_cmp_ge_u32 s7, s11
-; GFX6-NEXT:    s_cselect_b32 s16, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s6, s10
 ; GFX6-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s7, s11
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s16
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s15
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s14
+; GFX6-NEXT:    s_cmp_eq_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s6, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s5, s15, s17
+; GFX6-NEXT:    s_cselect_b32 s4, s14, s16
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_u32 s10, 0, s8
-; GFX9-NEXT:    s_subb_u32 s11, 0, s9
+; GFX9-NEXT:    s_sub_u32 s4, 0, s8
+; GFX9-NEXT:    s_subb_u32 s5, 0, s9
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s12, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    s_mul_i32 s5, s10, s12
-; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT:    s_mul_i32 s13, s11, s4
-; GFX9-NEXT:    s_add_i32 s5, s14, s5
-; GFX9-NEXT:    s_mul_i32 s15, s10, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s13
-; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT:    s_mul_i32 s16, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v1
+; GFX9-NEXT:    s_mul_i32 s12, s4, s10
+; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT:    s_mul_i32 s13, s5, s11
+; GFX9-NEXT:    s_add_i32 s12, s14, s12
+; GFX9-NEXT:    s_mul_i32 s15, s4, s11
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT:    s_mul_i32 s16, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s12
 ; GFX9-NEXT:    s_add_u32 s14, s14, s16
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s13
-; GFX9-NEXT:    s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT:    s_mul_i32 s15, s12, s15
+; GFX9-NEXT:    s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT:    s_mul_i32 s15, s10, s15
 ; GFX9-NEXT:    s_add_u32 s14, s14, s15
-; GFX9-NEXT:    s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT:    s_mul_hi_u32 s16, s10, s12
 ; GFX9-NEXT:    s_addc_u32 s13, s13, s17
 ; GFX9-NEXT:    s_addc_u32 s14, s16, 0
-; GFX9-NEXT:    s_mul_i32 s5, s12, s5
-; GFX9-NEXT:    s_add_u32 s5, s13, s5
+; GFX9-NEXT:    s_mul_i32 s12, s10, s12
+; GFX9-NEXT:    s_add_u32 s12, s13, s12
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s14
-; GFX9-NEXT:    s_add_u32 s14, s4, s5
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s12, s12, s13
-; GFX9-NEXT:    s_mul_i32 s4, s10, s12
-; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT:    s_add_i32 s4, s5, s4
-; GFX9-NEXT:    s_mul_i32 s11, s11, s14
-; GFX9-NEXT:    s_add_i32 s4, s4, s11
-; GFX9-NEXT:    s_mul_i32 s10, s10, s14
-; GFX9-NEXT:    s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT:    s_mul_i32 s13, s12, s10
-; GFX9-NEXT:    s_mul_i32 s16, s14, s4
-; GFX9-NEXT:    s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT:    s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT:    s_add_u32 s10, s10, s16
+; GFX9-NEXT:    s_add_u32 s11, s11, s12
+; GFX9-NEXT:    s_addc_u32 s10, s10, s13
+; GFX9-NEXT:    s_mul_i32 s12, s4, s10
+; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT:    s_add_i32 s12, s13, s12
+; GFX9-NEXT:    s_mul_i32 s5, s5, s11
+; GFX9-NEXT:    s_add_i32 s12, s12, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s11
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT:    s_mul_i32 s14, s10, s4
+; GFX9-NEXT:    s_mul_i32 s16, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT:    s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT:    s_add_u32 s4, s4, s16
 ; GFX9-NEXT:    s_addc_u32 s15, 0, s15
-; GFX9-NEXT:    s_add_u32 s10, s10, s13
-; GFX9-NEXT:    s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT:    s_addc_u32 s10, s15, s11
+; GFX9-NEXT:    s_add_u32 s4, s4, s14
+; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT:    s_addc_u32 s4, s15, s13
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    s_mul_i32 s4, s12, s4
-; GFX9-NEXT:    s_add_u32 s4, s10, s4
-; GFX9-NEXT:    s_addc_u32 s10, 0, s5
-; GFX9-NEXT:    s_add_u32 s11, s14, s4
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s10, s12, s10
+; GFX9-NEXT:    s_mul_i32 s12, s10, s12
+; GFX9-NEXT:    s_add_u32 s4, s4, s12
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_add_u32 s11, s11, s4
+; GFX9-NEXT:    s_addc_u32 s10, s10, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_addc_u32 s11, s12, s15
 ; GFX9-NEXT:    s_addc_u32 s12, s14, 0
 ; GFX9-NEXT:    s_mul_i32 s10, s3, s10
-; GFX9-NEXT:    s_add_u32 s14, s11, s10
-; GFX9-NEXT:    s_addc_u32 s15, 0, s12
-; GFX9-NEXT:    s_mul_i32 s10, s8, s15
-; GFX9-NEXT:    s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT:    s_add_u32 s13, s11, s10
+; GFX9-NEXT:    s_addc_u32 s12, 0, s12
+; GFX9-NEXT:    s_mul_i32 s10, s8, s12
+; GFX9-NEXT:    s_mul_hi_u32 s11, s8, s13
 ; GFX9-NEXT:    s_add_i32 s10, s11, s10
-; GFX9-NEXT:    s_mul_i32 s11, s9, s14
-; GFX9-NEXT:    s_add_i32 s16, s10, s11
-; GFX9-NEXT:    s_sub_i32 s12, s3, s16
-; GFX9-NEXT:    s_mul_i32 s10, s8, s14
+; GFX9-NEXT:    s_mul_i32 s11, s9, s13
+; GFX9-NEXT:    s_add_i32 s14, s10, s11
+; GFX9-NEXT:    s_sub_i32 s15, s3, s14
+; GFX9-NEXT:    s_mul_i32 s10, s8, s13
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s10
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s17, s12, s9
-; GFX9-NEXT:    s_sub_u32 s18, s2, s8
-; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT:    s_subb_u32 s12, s17, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s12, s9
-; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s18, s8
+; GFX9-NEXT:    s_subb_u32 s15, s15, s9
+; GFX9-NEXT:    s_sub_u32 s16, s2, s8
+; GFX9-NEXT:    s_subb_u32 s15, s15, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s15, s9
 ; GFX9-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s12, s9
-; GFX9-NEXT:    s_cselect_b32 s12, s17, s13
-; GFX9-NEXT:    s_add_u32 s13, s14, 1
-; GFX9-NEXT:    s_addc_u32 s17, s15, 0
-; GFX9-NEXT:    s_add_u32 s18, s14, 2
-; GFX9-NEXT:    s_addc_u32 s19, s15, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX9-NEXT:    s_cselect_b32 s12, s18, s13
-; GFX9-NEXT:    s_cselect_b32 s13, s19, s17
+; GFX9-NEXT:    s_cmp_ge_u32 s16, s8
+; GFX9-NEXT:    s_cselect_b32 s16, -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s15, s9
+; GFX9-NEXT:    s_cselect_b32 s15, s16, s17
+; GFX9-NEXT:    s_add_u32 s16, s13, 1
+; GFX9-NEXT:    s_addc_u32 s17, s12, 0
+; GFX9-NEXT:    s_add_u32 s18, s13, 2
+; GFX9-NEXT:    s_addc_u32 s19, s12, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b32 s15, s18, s16
+; GFX9-NEXT:    s_cselect_b32 s16, s19, s17
 ; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s3, s3, s16
+; GFX9-NEXT:    s_subb_u32 s3, s3, s14
 ; GFX9-NEXT:    s_cmp_ge_u32 s3, s9
 ; GFX9-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_cmp_eq_u32 s3, s9
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s10
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_cselect_b32 s3, s13, s15
-; GFX9-NEXT:    s_cselect_b32 s2, s12, s14
+; GFX9-NEXT:    s_cselect_b32 s3, s16, s12
+; GFX9-NEXT:    s_cselect_b32 s2, s15, s13
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_addc_u32 s17, 0, s18
 ; GFX6-NEXT:    s_add_u32 s18, s12, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_addc_u32 s16, s16, s17
 ; GFX6-NEXT:    s_mul_i32 s12, s14, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_add_u32 s15, s18, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_addc_u32 s14, s16, s14
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX6-NEXT:    s_addc_u32 s16, s16, 0
 ; GFX6-NEXT:    s_mul_i32 s14, s9, s14
-; GFX6-NEXT:    s_add_u32 s17, s15, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s17
+; GFX6-NEXT:    s_add_u32 s18, s15, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    s_addc_u32 s16, 0, s16
-; GFX6-NEXT:    s_mul_i32 s14, s6, s16
+; GFX6-NEXT:    s_addc_u32 s19, 0, s16
+; GFX6-NEXT:    s_mul_i32 s14, s6, s19
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
 ; GFX6-NEXT:    s_add_i32 s14, s15, s14
-; GFX6-NEXT:    s_mul_i32 s15, s7, s17
-; GFX6-NEXT:    s_add_i32 s18, s14, s15
-; GFX6-NEXT:    s_sub_i32 s19, s9, s18
-; GFX6-NEXT:    s_mul_i32 s14, s6, s17
+; GFX6-NEXT:    s_mul_i32 s15, s7, s18
+; GFX6-NEXT:    s_add_i32 s20, s14, s15
+; GFX6-NEXT:    s_sub_i32 s16, s9, s20
+; GFX6-NEXT:    s_mul_i32 s14, s6, s18
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s14
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s20, s14, s15
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_subb_u32 s19, s19, s7
-; GFX6-NEXT:    s_sub_u32 s21, s8, s6
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT:    s_or_b32 s17, s14, s15
+; GFX6-NEXT:    s_subb_u32 s21, s16, s7
+; GFX6-NEXT:    s_sub_u32 s22, s8, s6
+; GFX6-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT:    s_or_b32 s16, s16, s17
+; GFX6-NEXT:    s_subb_u32 s16, s21, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s16, s7
+; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s22, s6
+; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s16, s7
+; GFX6-NEXT:    s_cselect_b32 s16, s21, s17
+; GFX6-NEXT:    s_add_u32 s17, s18, 1
+; GFX6-NEXT:    s_addc_u32 s21, s19, 0
+; GFX6-NEXT:    s_add_u32 s22, s18, 2
+; GFX6-NEXT:    s_addc_u32 s23, s19, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b32 s16, s22, s17
+; GFX6-NEXT:    s_cselect_b32 s17, s23, s21
 ; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_subb_u32 s14, s19, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s14, s7
-; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s21, s6
-; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s14, s7
-; GFX6-NEXT:    s_cselect_b32 s14, s19, s15
-; GFX6-NEXT:    s_add_u32 s15, s17, 1
-; GFX6-NEXT:    s_addc_u32 s19, s16, 0
-; GFX6-NEXT:    s_add_u32 s21, s17, 2
-; GFX6-NEXT:    s_addc_u32 s22, s16, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s21, s15
-; GFX6-NEXT:    s_cselect_b32 s15, s22, s19
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_subb_u32 s9, s9, s18
+; GFX6-NEXT:    s_subb_u32 s9, s9, s20
 ; GFX6-NEXT:    s_cmp_ge_u32 s9, s7
-; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s8, s6
 ; GFX6-NEXT:    s_cselect_b32 s6, -1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s9, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s18
+; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s15, s16
-; GFX6-NEXT:    s_cselect_b32 s6, s14, s17
+; GFX6-NEXT:    s_cselect_b32 s7, s17, s19
+; GFX6-NEXT:    s_cselect_b32 s6, s16, s18
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[12:13], s[2:3]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT:    s_sub_u32 s14, s6, s2
-; GFX6-NEXT:    s_subb_u32 s15, s7, s3
+; GFX6-NEXT:    s_sub_u32 s16, s6, s2
+; GFX6-NEXT:    s_subb_u32 s17, s7, s3
 ; GFX6-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX6-NEXT:    s_add_u32 s0, s0, s6
 ; GFX6-NEXT:    s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s16
+; GFX6-NEXT:    s_mul_i32 s1, s12, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
 ; GFX6-NEXT:    s_mul_i32 s0, s13, s2
 ; GFX6-NEXT:    s_add_i32 s1, s3, s1
 ; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s17, s12, s2
+; GFX6-NEXT:    s_mul_i32 s15, s12, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s15
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GFX6-NEXT:    s_mul_i32 s4, s2, s3
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s15
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
 ; GFX6-NEXT:    s_add_u32 s4, s18, s4
 ; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s17, s16, s17
+; GFX6-NEXT:    s_mul_i32 s15, s14, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s17
+; GFX6-NEXT:    s_add_u32 s4, s4, s15
 ; GFX6-NEXT:    s_addc_u32 s4, s5, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s16, s3
+; GFX6-NEXT:    s_mul_i32 s3, s14, s3
 ; GFX6-NEXT:    s_add_u32 s3, s4, s3
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s5
 ; GFX6-NEXT:    s_add_u32 s5, s2, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s4, s16, s4
+; GFX6-NEXT:    s_addc_u32 s4, s14, s4
 ; GFX6-NEXT:    s_mul_i32 s2, s12, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX6-NEXT:    s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX6-NEXT:    s_mul_i32 s13, s5, s2
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
+; GFX6-NEXT:    s_add_u32 s13, s15, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
 ; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
 ; GFX6-NEXT:    s_add_u32 s3, s13, s3
-; GFX6-NEXT:    s_addc_u32 s3, s16, s12
+; GFX6-NEXT:    s_addc_u32 s3, s14, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
 ; GFX6-NEXT:    s_addc_u32 s12, s12, 0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_add_u32 s13, s5, s2
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s12, s4, s12
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
 ; GFX6-NEXT:    s_mul_i32 s2, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v3
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_add_u32 s2, s17, s2
-; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    s_add_u32 s2, s15, s2
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
 ; GFX6-NEXT:    s_mul_i32 s13, s11, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v1
 ; GFX6-NEXT:    s_add_u32 s2, s2, s13
-; GFX6-NEXT:    s_addc_u32 s2, s16, s17
+; GFX6-NEXT:    s_addc_u32 s2, s14, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX6-NEXT:    s_mul_i32 s12, s11, s12
-; GFX6-NEXT:    s_add_u32 s16, s2, s12
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    s_add_u32 s18, s2, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    s_addc_u32 s17, 0, s13
-; GFX6-NEXT:    s_mul_i32 s12, s8, s17
+; GFX6-NEXT:    s_addc_u32 s19, 0, s13
+; GFX6-NEXT:    s_mul_i32 s12, s8, s19
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_add_i32 s12, s13, s12
-; GFX6-NEXT:    s_mul_i32 s13, s9, s16
-; GFX6-NEXT:    s_add_i32 s18, s12, s13
-; GFX6-NEXT:    s_sub_i32 s19, s11, s18
-; GFX6-NEXT:    s_mul_i32 s12, s8, s16
+; GFX6-NEXT:    s_mul_i32 s13, s9, s18
+; GFX6-NEXT:    s_add_i32 s20, s12, s13
+; GFX6-NEXT:    s_sub_i32 s14, s11, s20
+; GFX6-NEXT:    s_mul_i32 s12, s8, s18
 ; GFX6-NEXT:    s_sub_u32 s10, s10, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s20, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_subb_u32 s19, s19, s9
-; GFX6-NEXT:    s_sub_u32 s21, s10, s8
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s15, s12, s13
+; GFX6-NEXT:    s_subb_u32 s21, s14, s9
+; GFX6-NEXT:    s_sub_u32 s22, s10, s8
+; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT:    s_or_b32 s14, s14, s15
+; GFX6-NEXT:    s_subb_u32 s14, s21, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s14, s9
+; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s22, s8
+; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s14, s9
+; GFX6-NEXT:    s_cselect_b32 s14, s21, s15
+; GFX6-NEXT:    s_add_u32 s15, s18, 1
+; GFX6-NEXT:    s_addc_u32 s21, s19, 0
+; GFX6-NEXT:    s_add_u32 s22, s18, 2
+; GFX6-NEXT:    s_addc_u32 s23, s19, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b32 s14, s22, s15
+; GFX6-NEXT:    s_cselect_b32 s15, s23, s21
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s12, s19, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s12, s9
-; GFX6-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s21, s8
-; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s12, s9
-; GFX6-NEXT:    s_cselect_b32 s12, s19, s13
-; GFX6-NEXT:    s_add_u32 s13, s16, 1
-; GFX6-NEXT:    s_addc_u32 s19, s17, 0
-; GFX6-NEXT:    s_add_u32 s21, s16, 2
-; GFX6-NEXT:    s_addc_u32 s22, s17, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b32 s12, s21, s13
-; GFX6-NEXT:    s_cselect_b32 s13, s22, s19
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_subb_u32 s11, s11, s18
+; GFX6-NEXT:    s_subb_u32 s11, s11, s20
 ; GFX6-NEXT:    s_cmp_ge_u32 s11, s9
-; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s12, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s10, s8
 ; GFX6-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s11, s9
-; GFX6-NEXT:    s_cselect_b32 s8, s8, s18
+; GFX6-NEXT:    s_cselect_b32 s8, s8, s12
 ; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s13, s17
-; GFX6-NEXT:    s_cselect_b32 s8, s12, s16
+; GFX6-NEXT:    s_cselect_b32 s9, s15, s19
+; GFX6-NEXT:    s_cselect_b32 s8, s14, s18
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_sub_u32 s4, s6, s4
 ; GFX6-NEXT:    s_subb_u32 s5, s7, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
-; GFX6-NEXT:    v_mov_b32_e32 v1, s15
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_u32 s14, 0, s6
-; GFX9-NEXT:    s_subb_u32 s15, 0, s7
+; GFX9-NEXT:    s_sub_u32 s12, 0, s6
+; GFX9-NEXT:    s_subb_u32 s13, 0, s7
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX9-NEXT:    s_mul_i32 s13, s14, s16
-; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT:    s_mul_i32 s17, s15, s12
-; GFX9-NEXT:    s_add_i32 s13, s18, s13
-; GFX9-NEXT:    s_mul_i32 s19, s14, s12
-; GFX9-NEXT:    s_add_i32 s13, s13, s17
-; GFX9-NEXT:    s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT:    s_mul_i32 s20, s12, s13
-; GFX9-NEXT:    s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX9-NEXT:    s_mul_i32 s16, s12, s14
+; GFX9-NEXT:    s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT:    s_mul_i32 s17, s13, s15
+; GFX9-NEXT:    s_add_i32 s16, s18, s16
+; GFX9-NEXT:    s_mul_i32 s19, s12, s15
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
+; GFX9-NEXT:    s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT:    s_mul_i32 s20, s15, s16
+; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
 ; GFX9-NEXT:    s_add_u32 s18, s18, s20
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s17
-; GFX9-NEXT:    s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT:    s_mul_i32 s19, s16, s19
+; GFX9-NEXT:    s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT:    s_mul_i32 s19, s14, s19
 ; GFX9-NEXT:    s_add_u32 s18, s18, s19
-; GFX9-NEXT:    s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT:    s_mul_hi_u32 s21, s14, s16
 ; GFX9-NEXT:    s_addc_u32 s17, s17, s20
 ; GFX9-NEXT:    s_addc_u32 s18, s21, 0
-; GFX9-NEXT:    s_mul_i32 s13, s16, s13
-; GFX9-NEXT:    s_add_u32 s13, s17, s13
+; GFX9-NEXT:    s_mul_i32 s16, s14, s16
+; GFX9-NEXT:    s_add_u32 s16, s17, s16
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s18
-; GFX9-NEXT:    s_add_u32 s18, s12, s13
-; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT:    s_addc_u32 s16, s16, s17
-; GFX9-NEXT:    s_mul_i32 s12, s14, s16
-; GFX9-NEXT:    s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT:    s_add_i32 s12, s13, s12
-; GFX9-NEXT:    s_mul_i32 s15, s15, s18
-; GFX9-NEXT:    s_add_i32 s12, s12, s15
-; GFX9-NEXT:    s_mul_i32 s14, s14, s18
-; GFX9-NEXT:    s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT:    s_mul_i32 s17, s16, s14
-; GFX9-NEXT:    s_mul_i32 s20, s18, s12
-; GFX9-NEXT:    s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT:    s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT:    s_add_u32 s14, s14, s20
+; GFX9-NEXT:    s_add_u32 s15, s15, s16
+; GFX9-NEXT:    s_addc_u32 s14, s14, s17
+; GFX9-NEXT:    s_mul_i32 s16, s12, s14
+; GFX9-NEXT:    s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT:    s_add_i32 s16, s17, s16
+; GFX9-NEXT:    s_mul_i32 s13, s13, s15
+; GFX9-NEXT:    s_add_i32 s16, s16, s13
+; GFX9-NEXT:    s_mul_i32 s12, s12, s15
+; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT:    s_mul_i32 s18, s14, s12
+; GFX9-NEXT:    s_mul_i32 s20, s15, s16
+; GFX9-NEXT:    s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT:    s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT:    s_add_u32 s12, s12, s20
 ; GFX9-NEXT:    s_addc_u32 s19, 0, s19
-; GFX9-NEXT:    s_add_u32 s14, s14, s17
-; GFX9-NEXT:    s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT:    s_addc_u32 s14, s19, s15
+; GFX9-NEXT:    s_add_u32 s12, s12, s18
+; GFX9-NEXT:    s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT:    s_addc_u32 s12, s19, s17
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
-; GFX9-NEXT:    s_mul_i32 s12, s16, s12
-; GFX9-NEXT:    s_add_u32 s12, s14, s12
-; GFX9-NEXT:    s_addc_u32 s14, 0, s13
-; GFX9-NEXT:    s_add_u32 s15, s18, s12
-; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT:    s_addc_u32 s14, s16, s14
+; GFX9-NEXT:    s_mul_i32 s16, s14, s16
+; GFX9-NEXT:    s_add_u32 s12, s12, s16
+; GFX9-NEXT:    s_addc_u32 s13, 0, s13
+; GFX9-NEXT:    s_add_u32 s15, s15, s12
+; GFX9-NEXT:    s_addc_u32 s14, s14, s13
 ; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX9-NEXT:    s_add_u32 s8, s8, s12
 ; GFX9-NEXT:    s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_addc_u32 s15, s16, s19
 ; GFX9-NEXT:    s_addc_u32 s16, s18, 0
 ; GFX9-NEXT:    s_mul_i32 s14, s9, s14
-; GFX9-NEXT:    s_add_u32 s18, s15, s14
-; GFX9-NEXT:    s_addc_u32 s19, 0, s16
-; GFX9-NEXT:    s_mul_i32 s14, s6, s19
-; GFX9-NEXT:    s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT:    s_add_u32 s17, s15, s14
+; GFX9-NEXT:    s_addc_u32 s16, 0, s16
+; GFX9-NEXT:    s_mul_i32 s14, s6, s16
+; GFX9-NEXT:    s_mul_hi_u32 s15, s6, s17
 ; GFX9-NEXT:    s_add_i32 s14, s15, s14
-; GFX9-NEXT:    s_mul_i32 s15, s7, s18
-; GFX9-NEXT:    s_add_i32 s20, s14, s15
-; GFX9-NEXT:    s_sub_i32 s16, s9, s20
-; GFX9-NEXT:    s_mul_i32 s14, s6, s18
+; GFX9-NEXT:    s_mul_i32 s15, s7, s17
+; GFX9-NEXT:    s_add_i32 s18, s14, s15
+; GFX9-NEXT:    s_sub_i32 s19, s9, s18
+; GFX9-NEXT:    s_mul_i32 s14, s6, s17
 ; GFX9-NEXT:    s_sub_u32 s8, s8, s14
 ; GFX9-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT:    s_subb_u32 s21, s16, s7
-; GFX9-NEXT:    s_sub_u32 s22, s8, s6
-; GFX9-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT:    s_subb_u32 s16, s21, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s16, s7
-; GFX9-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s22, s6
+; GFX9-NEXT:    s_subb_u32 s19, s19, s7
+; GFX9-NEXT:    s_sub_u32 s20, s8, s6
+; GFX9-NEXT:    s_subb_u32 s19, s19, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s19, s7
 ; GFX9-NEXT:    s_cselect_b32 s21, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s16, s7
-; GFX9-NEXT:    s_cselect_b32 s16, s21, s17
-; GFX9-NEXT:    s_add_u32 s17, s18, 1
-; GFX9-NEXT:    s_addc_u32 s21, s19, 0
-; GFX9-NEXT:    s_add_u32 s22, s18, 2
-; GFX9-NEXT:    s_addc_u32 s23, s19, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s16, s22, s17
-; GFX9-NEXT:    s_cselect_b32 s17, s23, s21
+; GFX9-NEXT:    s_cmp_ge_u32 s20, s6
+; GFX9-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s19, s7
+; GFX9-NEXT:    s_cselect_b32 s19, s20, s21
+; GFX9-NEXT:    s_add_u32 s20, s17, 1
+; GFX9-NEXT:    s_addc_u32 s21, s16, 0
+; GFX9-NEXT:    s_add_u32 s22, s17, 2
+; GFX9-NEXT:    s_addc_u32 s23, s16, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b32 s19, s22, s20
+; GFX9-NEXT:    s_cselect_b32 s20, s23, s21
 ; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT:    s_subb_u32 s9, s9, s20
+; GFX9-NEXT:    s_subb_u32 s9, s9, s18
 ; GFX9-NEXT:    s_cmp_ge_u32 s9, s7
 ; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_eq_u32 s9, s7
 ; GFX9-NEXT:    s_cselect_b32 s6, s6, s14
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_cselect_b32 s7, s17, s19
-; GFX9-NEXT:    s_cselect_b32 s6, s16, s18
+; GFX9-NEXT:    s_cselect_b32 s7, s20, s16
+; GFX9-NEXT:    s_cselect_b32 s6, s19, s17
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[12:13], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT:    s_sub_u32 s14, s6, s2
-; GFX9-NEXT:    s_subb_u32 s15, s7, s3
+; GFX9-NEXT:    s_sub_u32 s12, s6, s2
+; GFX9-NEXT:    s_subb_u32 s13, s7, s3
 ; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_u32 s8, 0, s6
-; GFX9-NEXT:    s_subb_u32 s9, 0, s7
+; GFX9-NEXT:    s_sub_u32 s4, 0, s6
+; GFX9-NEXT:    s_subb_u32 s5, 0, s7
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s13, v2
-; GFX9-NEXT:    s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT:    s_mul_i32 s16, s8, s13
-; GFX9-NEXT:    s_mul_i32 s5, s9, s4
-; GFX9-NEXT:    s_add_i32 s12, s12, s16
-; GFX9-NEXT:    s_add_i32 s12, s12, s5
-; GFX9-NEXT:    s_mul_i32 s17, s8, s4
-; GFX9-NEXT:    s_mul_i32 s16, s4, s12
-; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s15, v2
+; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT:    s_mul_i32 s16, s4, s15
+; GFX9-NEXT:    s_mul_i32 s9, s5, s8
+; GFX9-NEXT:    s_add_i32 s14, s14, s16
+; GFX9-NEXT:    s_add_i32 s14, s14, s9
+; GFX9-NEXT:    s_mul_i32 s17, s4, s8
+; GFX9-NEXT:    s_mul_i32 s16, s8, s14
+; GFX9-NEXT:    s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s14
 ; GFX9-NEXT:    s_add_u32 s16, s18, s16
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT:    s_mul_i32 s17, s13, s17
+; GFX9-NEXT:    s_addc_u32 s9, 0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT:    s_mul_i32 s17, s15, s17
 ; GFX9-NEXT:    s_add_u32 s16, s16, s17
-; GFX9-NEXT:    s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT:    s_addc_u32 s5, s5, s19
+; GFX9-NEXT:    s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT:    s_addc_u32 s9, s9, s19
 ; GFX9-NEXT:    s_addc_u32 s16, s18, 0
-; GFX9-NEXT:    s_mul_i32 s12, s13, s12
-; GFX9-NEXT:    s_add_u32 s5, s5, s12
-; GFX9-NEXT:    s_addc_u32 s12, 0, s16
-; GFX9-NEXT:    s_add_u32 s16, s4, s5
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s12, s13, s12
-; GFX9-NEXT:    s_mul_i32 s4, s8, s12
-; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT:    s_add_i32 s4, s5, s4
-; GFX9-NEXT:    s_mul_i32 s9, s9, s16
-; GFX9-NEXT:    s_add_i32 s4, s4, s9
-; GFX9-NEXT:    s_mul_i32 s8, s8, s16
-; GFX9-NEXT:    s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT:    s_mul_i32 s13, s12, s8
-; GFX9-NEXT:    s_mul_i32 s18, s16, s4
-; GFX9-NEXT:    s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT:    s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT:    s_add_u32 s8, s8, s18
+; GFX9-NEXT:    s_mul_i32 s14, s15, s14
+; GFX9-NEXT:    s_add_u32 s9, s9, s14
+; GFX9-NEXT:    s_addc_u32 s14, 0, s16
+; GFX9-NEXT:    s_add_u32 s8, s8, s9
+; GFX9-NEXT:    s_addc_u32 s9, s15, s14
+; GFX9-NEXT:    s_mul_i32 s14, s4, s9
+; GFX9-NEXT:    s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT:    s_add_i32 s14, s15, s14
+; GFX9-NEXT:    s_mul_i32 s5, s5, s8
+; GFX9-NEXT:    s_add_i32 s14, s14, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT:    s_mul_i32 s16, s9, s4
+; GFX9-NEXT:    s_mul_i32 s18, s8, s14
+; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT:    s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT:    s_add_u32 s4, s4, s18
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s17
-; GFX9-NEXT:    s_add_u32 s8, s8, s13
-; GFX9-NEXT:    s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT:    s_addc_u32 s8, s17, s9
+; GFX9-NEXT:    s_add_u32 s4, s4, s16
+; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT:    s_addc_u32 s4, s17, s15
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    s_mul_i32 s4, s12, s4
-; GFX9-NEXT:    s_add_u32 s4, s8, s4
-; GFX9-NEXT:    s_addc_u32 s8, 0, s5
-; GFX9-NEXT:    s_add_u32 s13, s16, s4
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s12, s12, s8
+; GFX9-NEXT:    s_mul_i32 s14, s9, s14
+; GFX9-NEXT:    s_add_u32 s4, s4, s14
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_add_u32 s14, s8, s4
+; GFX9-NEXT:    s_addc_u32 s15, s9, s5
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    s_add_u32 s8, s10, s4
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s9, s11, s4
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT:    s_mul_i32 s11, s8, s12
-; GFX9-NEXT:    s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT:    s_mul_i32 s11, s8, s15
+; GFX9-NEXT:    s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s15
 ; GFX9-NEXT:    s_add_u32 s11, s16, s11
 ; GFX9-NEXT:    s_addc_u32 s10, 0, s10
-; GFX9-NEXT:    s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT:    s_mul_i32 s13, s9, s13
-; GFX9-NEXT:    s_add_u32 s11, s11, s13
-; GFX9-NEXT:    s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT:    s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT:    s_mul_i32 s14, s9, s14
+; GFX9-NEXT:    s_add_u32 s11, s11, s14
+; GFX9-NEXT:    s_mul_hi_u32 s16, s9, s15
 ; GFX9-NEXT:    s_addc_u32 s10, s10, s17
 ; GFX9-NEXT:    s_addc_u32 s11, s16, 0
-; GFX9-NEXT:    s_mul_i32 s12, s9, s12
-; GFX9-NEXT:    s_add_u32 s16, s10, s12
-; GFX9-NEXT:    s_addc_u32 s17, 0, s11
-; GFX9-NEXT:    s_mul_i32 s10, s6, s17
-; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT:    s_mul_i32 s14, s9, s15
+; GFX9-NEXT:    s_add_u32 s14, s10, s14
+; GFX9-NEXT:    s_addc_u32 s15, 0, s11
+; GFX9-NEXT:    s_mul_i32 s10, s6, s15
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s14
 ; GFX9-NEXT:    s_add_i32 s10, s11, s10
-; GFX9-NEXT:    s_mul_i32 s11, s7, s16
-; GFX9-NEXT:    s_add_i32 s18, s10, s11
-; GFX9-NEXT:    s_sub_i32 s12, s9, s18
-; GFX9-NEXT:    s_mul_i32 s10, s6, s16
+; GFX9-NEXT:    s_mul_i32 s11, s7, s14
+; GFX9-NEXT:    s_add_i32 s16, s10, s11
+; GFX9-NEXT:    s_sub_i32 s17, s9, s16
+; GFX9-NEXT:    s_mul_i32 s10, s6, s14
 ; GFX9-NEXT:    s_sub_u32 s8, s8, s10
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s19, s12, s7
-; GFX9-NEXT:    s_sub_u32 s20, s8, s6
-; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT:    s_subb_u32 s12, s19, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s12, s7
-; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s20, s6
+; GFX9-NEXT:    s_subb_u32 s17, s17, s7
+; GFX9-NEXT:    s_sub_u32 s18, s8, s6
+; GFX9-NEXT:    s_subb_u32 s17, s17, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s17, s7
 ; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s12, s7
-; GFX9-NEXT:    s_cselect_b32 s12, s19, s13
-; GFX9-NEXT:    s_add_u32 s13, s16, 1
-; GFX9-NEXT:    s_addc_u32 s19, s17, 0
-; GFX9-NEXT:    s_add_u32 s20, s16, 2
-; GFX9-NEXT:    s_addc_u32 s21, s17, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX9-NEXT:    s_cselect_b32 s12, s20, s13
-; GFX9-NEXT:    s_cselect_b32 s13, s21, s19
+; GFX9-NEXT:    s_cmp_ge_u32 s18, s6
+; GFX9-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s17, s7
+; GFX9-NEXT:    s_cselect_b32 s17, s18, s19
+; GFX9-NEXT:    s_add_u32 s18, s14, 1
+; GFX9-NEXT:    s_addc_u32 s19, s15, 0
+; GFX9-NEXT:    s_add_u32 s20, s14, 2
+; GFX9-NEXT:    s_addc_u32 s21, s15, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_cselect_b32 s17, s20, s18
+; GFX9-NEXT:    s_cselect_b32 s18, s21, s19
 ; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s9, s9, s18
+; GFX9-NEXT:    s_subb_u32 s9, s9, s16
 ; GFX9-NEXT:    s_cmp_ge_u32 s9, s7
 ; GFX9-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_eq_u32 s9, s7
 ; GFX9-NEXT:    s_cselect_b32 s6, s6, s10
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_cselect_b32 s7, s13, s17
-; GFX9-NEXT:    s_cselect_b32 s6, s12, s16
+; GFX9-NEXT:    s_cselect_b32 s7, s18, s15
+; GFX9-NEXT:    s_cselect_b32 s6, s17, s14
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[2:3]
 ; GFX9-NEXT:    s_sub_u32 s2, s4, s2
 ; GFX9-NEXT:    s_subb_u32 s3, s5, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_addc_u32 s13, 0, s14
 ; GFX6-NEXT:    s_add_u32 s14, s0, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s14
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX6-NEXT:    s_addc_u32 s12, s12, s13
 ; GFX6-NEXT:    s_mul_i32 s0, s10, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_add_u32 s13, s14, s0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX6-NEXT:    s_addc_u32 s12, s12, s10
 ; GFX6-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX6-NEXT:    s_add_i32 s4, s5, s4
 ; GFX6-NEXT:    s_mul_i32 s5, s9, s12
-; GFX6-NEXT:    s_add_i32 s13, s4, s5
-; GFX6-NEXT:    s_sub_i32 s14, s7, s13
+; GFX6-NEXT:    s_add_i32 s14, s4, s5
+; GFX6-NEXT:    s_sub_i32 s13, s7, s14
 ; GFX6-NEXT:    s_mul_i32 s4, s8, s12
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX6-NEXT:    s_or_b32 s12, s4, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s14, s14, s9
-; GFX6-NEXT:    s_sub_u32 s15, s6, s8
-; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT:    s_subb_u32 s15, s13, s9
+; GFX6-NEXT:    s_sub_u32 s16, s6, s8
+; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s17, s12, s13
+; GFX6-NEXT:    s_subb_u32 s17, s15, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s17, s9
+; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s16, s8
+; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s17, s9
+; GFX6-NEXT:    s_cselect_b32 s18, s19, s18
+; GFX6-NEXT:    s_or_b32 s12, s12, s13
+; GFX6-NEXT:    s_subb_u32 s15, s15, s9
+; GFX6-NEXT:    s_sub_u32 s19, s16, s8
+; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s12, s12, s13
+; GFX6-NEXT:    s_subb_u32 s12, s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b32 s13, s19, s16
+; GFX6-NEXT:    s_cselect_b32 s12, s12, s17
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_subb_u32 s16, s14, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s16, s9
+; GFX6-NEXT:    s_subb_u32 s4, s7, s14
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s9
 ; GFX6-NEXT:    s_cselect_b32 s5, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s15, s8
-; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s16, s9
-; GFX6-NEXT:    s_cselect_b32 s17, s17, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_subb_u32 s14, s14, s9
-; GFX6-NEXT:    s_sub_u32 s18, s15, s8
-; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s4, s4, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_subb_u32 s4, s14, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s18, s15
-; GFX6-NEXT:    s_cselect_b32 s4, s4, s16
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s5, s7, s13
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s9
-; GFX6-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s6, s8
-; GFX6-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s5, s9
-; GFX6-NEXT:    s_cselect_b32 s7, s8, s7
-; GFX6-NEXT:    s_cmp_lg_u32 s7, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s4, s5
-; GFX6-NEXT:    s_cselect_b32 s4, s14, s6
+; GFX6-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, s9
+; GFX6-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    s_cselect_b32 s5, s12, s4
+; GFX6-NEXT:    s_cselect_b32 s4, s13, s6
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[10:11]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s10
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_u32 s8, 0, s6
-; GFX9-NEXT:    s_subb_u32 s9, 0, s7
+; GFX9-NEXT:    s_sub_u32 s4, 0, s6
+; GFX9-NEXT:    s_subb_u32 s5, 0, s7
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    s_mul_i32 s5, s8, s10
-; GFX9-NEXT:    s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT:    s_mul_i32 s11, s9, s4
-; GFX9-NEXT:    s_add_i32 s5, s12, s5
-; GFX9-NEXT:    s_mul_i32 s13, s8, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s11
-; GFX9-NEXT:    s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT:    s_mul_i32 s14, s4, s5
-; GFX9-NEXT:    s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT:    s_mul_i32 s11, s5, s9
+; GFX9-NEXT:    s_add_i32 s10, s12, s10
+; GFX9-NEXT:    s_mul_i32 s13, s4, s9
+; GFX9-NEXT:    s_add_i32 s10, s10, s11
+; GFX9-NEXT:    s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT:    s_mul_i32 s14, s9, s10
+; GFX9-NEXT:    s_mul_hi_u32 s11, s9, s10
 ; GFX9-NEXT:    s_add_u32 s12, s12, s14
 ; GFX9-NEXT:    s_addc_u32 s11, 0, s11
-; GFX9-NEXT:    s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT:    s_mul_i32 s13, s10, s13
+; GFX9-NEXT:    s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT:    s_mul_i32 s13, s8, s13
 ; GFX9-NEXT:    s_add_u32 s12, s12, s13
-; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT:    s_mul_hi_u32 s14, s8, s10
 ; GFX9-NEXT:    s_addc_u32 s11, s11, s15
 ; GFX9-NEXT:    s_addc_u32 s12, s14, 0
-; GFX9-NEXT:    s_mul_i32 s5, s10, s5
-; GFX9-NEXT:    s_add_u32 s5, s11, s5
+; GFX9-NEXT:    s_mul_i32 s10, s8, s10
+; GFX9-NEXT:    s_add_u32 s10, s11, s10
 ; GFX9-NEXT:    s_addc_u32 s11, 0, s12
-; GFX9-NEXT:    s_add_u32 s12, s4, s5
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s10, s10, s11
-; GFX9-NEXT:    s_mul_i32 s4, s8, s10
-; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT:    s_add_i32 s4, s5, s4
-; GFX9-NEXT:    s_mul_i32 s9, s9, s12
-; GFX9-NEXT:    s_add_i32 s4, s4, s9
-; GFX9-NEXT:    s_mul_i32 s8, s8, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT:    s_mul_i32 s11, s10, s8
-; GFX9-NEXT:    s_mul_i32 s14, s12, s4
-; GFX9-NEXT:    s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT:    s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT:    s_add_u32 s8, s8, s14
+; GFX9-NEXT:    s_add_u32 s9, s9, s10
+; GFX9-NEXT:    s_addc_u32 s8, s8, s11
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT:    s_add_i32 s10, s11, s10
+; GFX9-NEXT:    s_mul_i32 s5, s5, s9
+; GFX9-NEXT:    s_add_i32 s10, s10, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s9
+; GFX9-NEXT:    s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT:    s_mul_i32 s12, s8, s4
+; GFX9-NEXT:    s_mul_i32 s14, s9, s10
+; GFX9-NEXT:    s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT:    s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT:    s_add_u32 s4, s4, s14
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s13
-; GFX9-NEXT:    s_add_u32 s8, s8, s11
-; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT:    s_addc_u32 s8, s13, s9
+; GFX9-NEXT:    s_add_u32 s4, s4, s12
+; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT:    s_addc_u32 s4, s13, s11
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    s_mul_i32 s4, s10, s4
-; GFX9-NEXT:    s_add_u32 s4, s8, s4
-; GFX9-NEXT:    s_addc_u32 s8, 0, s5
-; GFX9-NEXT:    s_add_u32 s9, s12, s4
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s8, s10, s8
+; GFX9-NEXT:    s_mul_i32 s10, s8, s10
+; GFX9-NEXT:    s_add_u32 s4, s4, s10
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_add_u32 s9, s9, s4
+; GFX9-NEXT:    s_addc_u32 s8, s8, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s4
@@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_mul_i32 s8, s6, s8
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s8
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_subb_u32 s13, s10, s7
 ; GFX9-NEXT:    s_sub_u32 s14, s2, s6
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX9-NEXT:    s_subb_u32 s15, s13, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s15, s7
 ; GFX9-NEXT:    s_cselect_b32 s16, -1, 0
@@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    s_cmp_eq_u32 s15, s7
 ; GFX9-NEXT:    s_cselect_b32 s16, s17, s16
 ; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s13, s13, s7
-; GFX9-NEXT:    s_sub_u32 s17, s14, s6
-; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s10, s13, 0
+; GFX9-NEXT:    s_subb_u32 s10, s13, s7
+; GFX9-NEXT:    s_sub_u32 s11, s14, s6
+; GFX9-NEXT:    s_subb_u32 s10, s10, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s11, s17, s14
+; GFX9-NEXT:    s_cselect_b32 s11, s11, s14
 ; GFX9-NEXT:    s_cselect_b32 s10, s10, s15
 ; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s12
@@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_addc_u32 s15, 0, s16
 ; GFX6-NEXT:    s_add_u32 s16, s6, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s14, s14, s15
 ; GFX6-NEXT:    s_mul_i32 s6, s12, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
@@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_add_u32 s13, s16, s6
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s12, s14, s12
 ; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s6
@@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
 ; GFX6-NEXT:    s_add_i32 s13, s14, s13
 ; GFX6-NEXT:    s_mul_i32 s14, s3, s12
-; GFX6-NEXT:    s_add_i32 s14, s13, s14
-; GFX6-NEXT:    s_sub_i32 s15, s9, s14
+; GFX6-NEXT:    s_add_i32 s16, s13, s14
+; GFX6-NEXT:    s_sub_i32 s14, s9, s16
 ; GFX6-NEXT:    s_mul_i32 s12, s2, s12
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s16, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_subb_u32 s15, s15, s3
-; GFX6-NEXT:    s_sub_u32 s17, s8, s2
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s18, s15, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s18, s3
-; GFX6-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s17, s2
-; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s18, s3
-; GFX6-NEXT:    s_cselect_b32 s19, s19, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s15, s15, s3
-; GFX6-NEXT:    s_sub_u32 s20, s17, s2
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s15, s12, s13
+; GFX6-NEXT:    s_subb_u32 s17, s14, s3
+; GFX6-NEXT:    s_sub_u32 s18, s8, s2
+; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT:    s_or_b32 s19, s14, s15
+; GFX6-NEXT:    s_subb_u32 s19, s17, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s3
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s18, s2
+; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s3
+; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
+; GFX6-NEXT:    s_or_b32 s14, s14, s15
+; GFX6-NEXT:    s_subb_u32 s17, s17, s3
+; GFX6-NEXT:    s_sub_u32 s21, s18, s2
+; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT:    s_or_b32 s14, s14, s15
+; GFX6-NEXT:    s_subb_u32 s14, s17, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b32 s15, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s14, s14, s19
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_subb_u32 s12, s15, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b32 s13, s20, s17
-; GFX6-NEXT:    s_cselect_b32 s12, s12, s18
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_subb_u32 s9, s9, s14
+; GFX6-NEXT:    s_subb_u32 s9, s9, s16
 ; GFX6-NEXT:    s_cmp_ge_u32 s9, s3
-; GFX6-NEXT:    s_cselect_b32 s14, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s12, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s8, s2
 ; GFX6-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s9, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s2, s14
+; GFX6-NEXT:    s_cselect_b32 s2, s2, s12
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_cselect_b32 s3, s12, s9
-; GFX6-NEXT:    s_cselect_b32 s2, s13, s8
+; GFX6-NEXT:    s_cselect_b32 s3, s14, s9
+; GFX6-NEXT:    s_cselect_b32 s2, s15, s8
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT:    s_sub_u32 s12, s2, s6
-; GFX6-NEXT:    s_subb_u32 s13, s3, s6
+; GFX6-NEXT:    s_sub_u32 s14, s2, s6
+; GFX6-NEXT:    s_subb_u32 s15, s3, s6
 ; GFX6-NEXT:    s_ashr_i32 s2, s1, 31
 ; GFX6-NEXT:    s_add_u32 s0, s0, s2
 ; GFX6-NEXT:    s_mov_b32 s3, s2
@@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s8, s14
+; GFX6-NEXT:    s_mul_i32 s1, s8, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
 ; GFX6-NEXT:    s_mul_i32 s0, s9, s2
 ; GFX6-NEXT:    s_add_i32 s1, s3, s1
 ; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s15, s8, s2
+; GFX6-NEXT:    s_mul_i32 s13, s8, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GFX6-NEXT:    s_mul_i32 s4, s2, s3
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
 ; GFX6-NEXT:    s_add_u32 s4, s16, s4
 ; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s15, s14, s15
+; GFX6-NEXT:    s_mul_i32 s13, s12, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s15
+; GFX6-NEXT:    s_add_u32 s4, s4, s13
 ; GFX6-NEXT:    s_addc_u32 s4, s5, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s14, s3
+; GFX6-NEXT:    s_mul_i32 s3, s12, s3
 ; GFX6-NEXT:    s_add_u32 s3, s4, s3
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s5
 ; GFX6-NEXT:    s_add_u32 s5, s2, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s4, s14, s4
+; GFX6-NEXT:    s_addc_u32 s4, s12, s4
 ; GFX6-NEXT:    s_mul_i32 s2, s8, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX6-NEXT:    s_add_i32 s2, s3, s2
@@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX6-NEXT:    s_mul_i32 s9, s5, s2
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_add_u32 s9, s15, s9
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v2
+; GFX6-NEXT:    s_add_u32 s9, s13, s9
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
 ; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
+; GFX6-NEXT:    s_addc_u32 s12, 0, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v3
 ; GFX6-NEXT:    s_add_u32 s3, s9, s3
-; GFX6-NEXT:    s_addc_u32 s3, s14, s8
+; GFX6-NEXT:    s_addc_u32 s3, s12, s8
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v1
 ; GFX6-NEXT:    s_addc_u32 s8, s8, 0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
 ; GFX6-NEXT:    s_add_u32 s2, s3, s2
 ; GFX6-NEXT:    s_addc_u32 s8, 0, s8
-; GFX6-NEXT:    s_add_u32 s14, s5, s2
+; GFX6-NEXT:    s_add_u32 s12, s5, s2
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s15, s4, s8
+; GFX6-NEXT:    s_addc_u32 s13, s4, s8
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s11, s4
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s15
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s14
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT:    s_mul_i32 s2, s8, s15
+; GFX6-NEXT:    s_mul_i32 s2, s8, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
 ; GFX6-NEXT:    s_add_u32 s2, s11, s2
 ; GFX6-NEXT:    s_addc_u32 s10, 0, s10
-; GFX6-NEXT:    s_mul_i32 s11, s9, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_mul_i32 s11, s9, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
 ; GFX6-NEXT:    s_add_u32 s2, s2, s11
-; GFX6-NEXT:    s_addc_u32 s2, s10, s14
+; GFX6-NEXT:    s_addc_u32 s2, s10, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v0
 ; GFX6-NEXT:    s_addc_u32 s10, s10, 0
-; GFX6-NEXT:    s_mul_i32 s11, s9, s15
+; GFX6-NEXT:    s_mul_i32 s11, s9, s13
 ; GFX6-NEXT:    s_add_u32 s11, s2, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX6-NEXT:    s_addc_u32 s10, 0, s10
 ; GFX6-NEXT:    s_mul_i32 s10, s6, s10
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
-; GFX6-NEXT:    s_add_i32 s10, s14, s10
-; GFX6-NEXT:    s_mul_i32 s14, s7, s11
-; GFX6-NEXT:    s_add_i32 s14, s10, s14
-; GFX6-NEXT:    s_sub_i32 s15, s9, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX6-NEXT:    s_add_i32 s10, s12, s10
+; GFX6-NEXT:    s_mul_i32 s12, s7, s11
+; GFX6-NEXT:    s_add_i32 s16, s10, s12
+; GFX6-NEXT:    s_sub_i32 s12, s9, s16
 ; GFX6-NEXT:    s_mul_i32 s10, s6, s11
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s10
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT:    s_or_b32 s16, s10, s11
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_subb_u32 s15, s15, s7
-; GFX6-NEXT:    s_sub_u32 s17, s8, s6
-; GFX6-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT:    s_or_b32 s10, s10, s11
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_subb_u32 s18, s15, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s18, s7
-; GFX6-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s17, s6
-; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s18, s7
-; GFX6-NEXT:    s_cselect_b32 s19, s19, s11
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_subb_u32 s15, s15, s7
-; GFX6-NEXT:    s_sub_u32 s20, s17, s6
-; GFX6-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT:    s_or_b32 s13, s10, s11
+; GFX6-NEXT:    s_subb_u32 s17, s12, s7
+; GFX6-NEXT:    s_sub_u32 s18, s8, s6
+; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s19, s12, s13
+; GFX6-NEXT:    s_subb_u32 s19, s17, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s7
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s18, s6
+; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s7
+; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
+; GFX6-NEXT:    s_or_b32 s12, s12, s13
+; GFX6-NEXT:    s_subb_u32 s17, s17, s7
+; GFX6-NEXT:    s_sub_u32 s21, s18, s6
+; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT:    s_or_b32 s12, s12, s13
+; GFX6-NEXT:    s_subb_u32 s12, s17, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_cselect_b32 s13, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s12, s12, s19
 ; GFX6-NEXT:    s_or_b32 s10, s10, s11
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_subb_u32 s10, s15, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b32 s11, s20, s17
-; GFX6-NEXT:    s_cselect_b32 s10, s10, s18
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_subb_u32 s9, s9, s14
+; GFX6-NEXT:    s_subb_u32 s9, s9, s16
 ; GFX6-NEXT:    s_cmp_ge_u32 s9, s7
-; GFX6-NEXT:    s_cselect_b32 s14, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s8, s6
 ; GFX6-NEXT:    s_cselect_b32 s6, -1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s9, s7
-; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
+; GFX6-NEXT:    s_cselect_b32 s6, s6, s10
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s10, s9
-; GFX6-NEXT:    s_cselect_b32 s6, s11, s8
+; GFX6-NEXT:    s_cselect_b32 s7, s12, s9
+; GFX6-NEXT:    s_cselect_b32 s6, s13, s8
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_sub_u32 s5, s6, s4
 ; GFX6-NEXT:    s_subb_u32 s4, s7, s4
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NEXT:    v_mov_b32_e32 v1, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_sub_u32 s12, 0, s2
-; GFX9-NEXT:    s_subb_u32 s13, 0, s3
+; GFX9-NEXT:    s_sub_u32 s6, 0, s2
+; GFX9-NEXT:    s_subb_u32 s7, 0, s3
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_mul_i32 s7, s12, s14
-; GFX9-NEXT:    s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT:    s_mul_i32 s15, s13, s6
-; GFX9-NEXT:    s_add_i32 s7, s16, s7
-; GFX9-NEXT:    s_mul_i32 s17, s12, s6
-; GFX9-NEXT:    s_add_i32 s7, s7, s15
-; GFX9-NEXT:    s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT:    s_mul_i32 s18, s6, s7
-; GFX9-NEXT:    s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX9-NEXT:    s_mul_i32 s14, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT:    s_mul_i32 s15, s7, s13
+; GFX9-NEXT:    s_add_i32 s14, s16, s14
+; GFX9-NEXT:    s_mul_i32 s17, s6, s13
+; GFX9-NEXT:    s_add_i32 s14, s14, s15
+; GFX9-NEXT:    s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT:    s_mul_i32 s18, s13, s14
+; GFX9-NEXT:    s_mul_hi_u32 s15, s13, s14
 ; GFX9-NEXT:    s_add_u32 s16, s16, s18
 ; GFX9-NEXT:    s_addc_u32 s15, 0, s15
-; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT:    s_mul_i32 s17, s14, s17
+; GFX9-NEXT:    s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT:    s_mul_i32 s17, s12, s17
 ; GFX9-NEXT:    s_add_u32 s16, s16, s17
-; GFX9-NEXT:    s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT:    s_mul_hi_u32 s19, s12, s14
 ; GFX9-NEXT:    s_addc_u32 s15, s15, s18
 ; GFX9-NEXT:    s_addc_u32 s16, s19, 0
-; GFX9-NEXT:    s_mul_i32 s7, s14, s7
-; GFX9-NEXT:    s_add_u32 s7, s15, s7
+; GFX9-NEXT:    s_mul_i32 s14, s12, s14
+; GFX9-NEXT:    s_add_u32 s14, s15, s14
 ; GFX9-NEXT:    s_addc_u32 s15, 0, s16
-; GFX9-NEXT:    s_add_u32 s16, s6, s7
-; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT:    s_addc_u32 s14, s14, s15
-; GFX9-NEXT:    s_mul_i32 s6, s12, s14
-; GFX9-NEXT:    s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT:    s_add_i32 s6, s7, s6
-; GFX9-NEXT:    s_mul_i32 s13, s13, s16
-; GFX9-NEXT:    s_add_i32 s6, s6, s13
-; GFX9-NEXT:    s_mul_i32 s12, s12, s16
-; GFX9-NEXT:    s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT:    s_mul_i32 s15, s14, s12
-; GFX9-NEXT:    s_mul_i32 s18, s16, s6
-; GFX9-NEXT:    s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT:    s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT:    s_add_u32 s12, s12, s18
+; GFX9-NEXT:    s_add_u32 s13, s13, s14
+; GFX9-NEXT:    s_addc_u32 s12, s12, s15
+; GFX9-NEXT:    s_mul_i32 s14, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT:    s_add_i32 s14, s15, s14
+; GFX9-NEXT:    s_mul_i32 s7, s7, s13
+; GFX9-NEXT:    s_add_i32 s14, s14, s7
+; GFX9-NEXT:    s_mul_i32 s6, s6, s13
+; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT:    s_mul_i32 s16, s12, s6
+; GFX9-NEXT:    s_mul_i32 s18, s13, s14
+; GFX9-NEXT:    s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT:    s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT:    s_add_u32 s6, s6, s18
 ; GFX9-NEXT:    s_addc_u32 s17, 0, s17
-; GFX9-NEXT:    s_add_u32 s12, s12, s15
-; GFX9-NEXT:    s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT:    s_addc_u32 s12, s17, s13
+; GFX9-NEXT:    s_add_u32 s6, s6, s16
+; GFX9-NEXT:    s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT:    s_addc_u32 s6, s17, s15
 ; GFX9-NEXT:    s_addc_u32 s7, s7, 0
-; GFX9-NEXT:    s_mul_i32 s6, s14, s6
-; GFX9-NEXT:    s_add_u32 s6, s12, s6
-; GFX9-NEXT:    s_addc_u32 s12, 0, s7
-; GFX9-NEXT:    s_add_u32 s13, s16, s6
-; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT:    s_addc_u32 s12, s14, s12
+; GFX9-NEXT:    s_mul_i32 s14, s12, s14
+; GFX9-NEXT:    s_add_u32 s6, s6, s14
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_add_u32 s13, s13, s6
+; GFX9-NEXT:    s_addc_u32 s12, s12, s7
 ; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX9-NEXT:    s_add_u32 s8, s8, s6
 ; GFX9-NEXT:    s_mov_b32 s7, s6
@@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s12, s2, s12
 ; GFX9-NEXT:    s_sub_u32 s8, s8, s12
 ; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
 ; GFX9-NEXT:    s_subb_u32 s17, s14, s3
 ; GFX9-NEXT:    s_sub_u32 s18, s8, s2
 ; GFX9-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX9-NEXT:    s_subb_u32 s19, s17, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s19, s3
 ; GFX9-NEXT:    s_cselect_b32 s20, -1, 0
@@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_eq_u32 s19, s3
 ; GFX9-NEXT:    s_cselect_b32 s20, s21, s20
 ; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT:    s_subb_u32 s17, s17, s3
-; GFX9-NEXT:    s_sub_u32 s21, s18, s2
-; GFX9-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT:    s_subb_u32 s14, s17, 0
+; GFX9-NEXT:    s_subb_u32 s14, s17, s3
+; GFX9-NEXT:    s_sub_u32 s15, s18, s2
+; GFX9-NEXT:    s_subb_u32 s14, s14, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b32 s15, s21, s18
+; GFX9-NEXT:    s_cselect_b32 s15, s15, s18
 ; GFX9-NEXT:    s_cselect_b32 s14, s14, s19
 ; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
 ; GFX9-NEXT:    s_subb_u32 s9, s9, s16
@@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT:    s_sub_u32 s6, 0, s2
-; GFX9-NEXT:    s_subb_u32 s7, 0, s3
+; GFX9-NEXT:    s_sub_u32 s4, 0, s2
+; GFX9-NEXT:    s_subb_u32 s5, 0, s3
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT:    s_mul_i32 s14, s6, s9
-; GFX9-NEXT:    s_mul_i32 s5, s7, s4
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT:    s_mul_i32 s14, s4, s9
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
 ; GFX9-NEXT:    s_add_i32 s8, s8, s14
-; GFX9-NEXT:    s_add_i32 s8, s8, s5
-; GFX9-NEXT:    s_mul_i32 s15, s6, s4
-; GFX9-NEXT:    s_mul_i32 s14, s4, s8
-; GFX9-NEXT:    s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_i32 s15, s4, s6
+; GFX9-NEXT:    s_mul_i32 s14, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s8
 ; GFX9-NEXT:    s_add_u32 s14, s16, s14
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
 ; GFX9-NEXT:    s_mul_hi_u32 s17, s9, s15
 ; GFX9-NEXT:    s_mul_i32 s15, s9, s15
 ; GFX9-NEXT:    s_add_u32 s14, s14, s15
 ; GFX9-NEXT:    s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT:    s_addc_u32 s5, s5, s17
+; GFX9-NEXT:    s_addc_u32 s7, s7, s17
 ; GFX9-NEXT:    s_addc_u32 s14, s16, 0
 ; GFX9-NEXT:    s_mul_i32 s8, s9, s8
-; GFX9-NEXT:    s_add_u32 s5, s5, s8
+; GFX9-NEXT:    s_add_u32 s7, s7, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s14
-; GFX9-NEXT:    s_add_u32 s14, s4, s5
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s8, s9, s8
-; GFX9-NEXT:    s_mul_i32 s4, s6, s8
-; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT:    s_add_i32 s4, s5, s4
-; GFX9-NEXT:    s_mul_i32 s7, s7, s14
-; GFX9-NEXT:    s_add_i32 s4, s4, s7
-; GFX9-NEXT:    s_mul_i32 s6, s6, s14
-; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT:    s_mul_i32 s9, s8, s6
-; GFX9-NEXT:    s_mul_i32 s16, s14, s4
-; GFX9-NEXT:    s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT:    s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT:    s_add_u32 s6, s6, s16
+; GFX9-NEXT:    s_add_u32 s6, s6, s7
+; GFX9-NEXT:    s_addc_u32 s7, s9, s8
+; GFX9-NEXT:    s_mul_i32 s8, s4, s7
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT:    s_add_i32 s8, s9, s8
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_add_i32 s8, s8, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s6
+; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT:    s_mul_i32 s14, s7, s4
+; GFX9-NEXT:    s_mul_i32 s16, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT:    s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT:    s_add_u32 s4, s4, s16
 ; GFX9-NEXT:    s_addc_u32 s15, 0, s15
-; GFX9-NEXT:    s_add_u32 s6, s6, s9
-; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT:    s_addc_u32 s6, s15, s7
+; GFX9-NEXT:    s_add_u32 s4, s4, s14
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT:    s_addc_u32 s4, s15, s9
 ; GFX9-NEXT:    s_addc_u32 s5, s5, 0
-; GFX9-NEXT:    s_mul_i32 s4, s8, s4
-; GFX9-NEXT:    s_add_u32 s4, s6, s4
-; GFX9-NEXT:    s_addc_u32 s6, 0, s5
-; GFX9-NEXT:    s_add_u32 s9, s14, s4
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s8, s8, s6
+; GFX9-NEXT:    s_mul_i32 s8, s7, s8
+; GFX9-NEXT:    s_add_u32 s4, s4, s8
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_add_u32 s8, s6, s4
+; GFX9-NEXT:    s_addc_u32 s9, s7, s5
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX9-NEXT:    s_add_u32 s6, s10, s4
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s7, s11, s4
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT:    s_mul_i32 s11, s6, s8
-; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT:    s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT:    s_mul_i32 s11, s6, s9
+; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s10, s6, s9
 ; GFX9-NEXT:    s_add_u32 s11, s14, s11
 ; GFX9-NEXT:    s_addc_u32 s10, 0, s10
-; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT:    s_mul_i32 s9, s7, s9
-; GFX9-NEXT:    s_add_u32 s9, s11, s9
-; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT:    s_addc_u32 s9, s10, s15
-; GFX9-NEXT:    s_addc_u32 s10, s14, 0
+; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s8
 ; GFX9-NEXT:    s_mul_i32 s8, s7, s8
-; GFX9-NEXT:    s_add_u32 s8, s9, s8
+; GFX9-NEXT:    s_add_u32 s8, s11, s8
+; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT:    s_addc_u32 s8, s10, s15
+; GFX9-NEXT:    s_addc_u32 s10, s14, 0
+; GFX9-NEXT:    s_mul_i32 s9, s7, s9
+; GFX9-NEXT:    s_add_u32 s8, s8, s9
 ; GFX9-NEXT:    s_addc_u32 s9, 0, s10
 ; GFX9-NEXT:    s_mul_i32 s9, s2, s9
 ; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s8
@@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX9-NEXT:    s_sub_u32 s6, s6, s8
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_subb_u32 s15, s10, s3
 ; GFX9-NEXT:    s_sub_u32 s16, s6, s2
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX9-NEXT:    s_subb_u32 s17, s15, 0
 ; GFX9-NEXT:    s_cmp_ge_u32 s17, s3
 ; GFX9-NEXT:    s_cselect_b32 s18, -1, 0
@@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_cmp_eq_u32 s17, s3
 ; GFX9-NEXT:    s_cselect_b32 s18, s19, s18
 ; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s15, s15, s3
-; GFX9-NEXT:    s_sub_u32 s19, s16, s2
-; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s10, s15, 0
+; GFX9-NEXT:    s_subb_u32 s10, s15, s3
+; GFX9-NEXT:    s_sub_u32 s11, s16, s2
+; GFX9-NEXT:    s_subb_u32 s10, s10, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b32 s11, s19, s16
+; GFX9-NEXT:    s_cselect_b32 s11, s11, s16
 ; GFX9-NEXT:    s_cselect_b32 s10, s10, s17
 ; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX9-NEXT:    s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 394727c..01f4414 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 258bc295..9db6d70 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s2
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v1, s2
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_add_i32 s6, s6, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
 ; GFX1264_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s7
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
 ; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
@@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s2
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s2
 ; GFX1064_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
 ; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s6, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v2, s2
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s6, s6, s3
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s6, s6, s2
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s10
 ; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[8:9]
 ; GFX1264_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1232_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT:    s_add_i32 s12, s12, s6
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT:    s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr2
 ; GFX1164_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT:    s_add_i32 s12, s12, s6
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
 ; GFX1264_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s7
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s7
 ; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xf1ff
 ; GFX1264_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    s_add_co_i32 s6, s6, s2
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s3
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s3
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s2
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s2
 ; GFX1064_ITERATIVE-NEXT:    s_add_u32 s8, s8, s3
 ; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s1
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s1
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s8, s8, s2
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s3
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s2
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s2
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s8, s8, s3
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s1
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v4, s8, s1
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s8, s8, s2
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s9, s9, s3
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1264_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s10
 ; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[8:9]
 ; GFX1264_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1232_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s1
 ; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, s1
 ; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1232_ITERATIVE-NEXT:    s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[2:3]
 ; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
 ; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s4
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX8_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s4
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
 ; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
 ; GFX1032_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v2, s6
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_add_u32 s0, s0, s7
 ; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s8
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
 ; GFX1132_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX8_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX9_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1164_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX8_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX9_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1164_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
 ; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
 ; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, s3
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1164_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
 ; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s8, s[2:3]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
 ; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, s3
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT:    s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT:    s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1064_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX1164_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
 ; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
 ; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
 ; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e4def28..9afc0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 39a3c9a..10fd34f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX8-NEXT:    s_mov_b32 m0, s3
-; GFX8-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX8-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX8-NEXT:    s_add_i32 s2, s2, s8
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX9-NEXT:    v_writelane_b32 v1, s2, m0
-; GFX9-NEXT:    s_add_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W64-NEXT:    s_ff1_i32_b64 s3, s[0:1]
 ; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX10W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX10W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10W32-NEXT:    s_ff1_i32_b32 s2, s1
 ; GFX10W32-NEXT:    v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX10W32-NEXT:    v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX10W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX10W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX11W64-NEXT:    s_add_i32 s2, s2, s8
-; GFX11W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX11W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX11W32-NEXT:    s_add_i32 s0, s0, s3
-; GFX11W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    s_ctz_i32_b64 s3, s[0:1]
 ; GFX12W64-NEXT:    s_wait_alu 0xfffe
 ; GFX12W64-NEXT:    v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX12W64-NEXT:    s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[6:7]
 ; GFX12W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12W64-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX12W32-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX12W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    s_ctz_i32_b32 s2, s1
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
 ; GFX12W32-NEXT:    v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT:    s_lshl_b32 s6, 1, s2
 ; GFX12W32-NEXT:    v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT:    s_lshl_b32 s2, 1, s2
 ; GFX12W32-NEXT:    s_add_co_i32 s0, s0, s3
 ; GFX12W32-NEXT:    s_wait_alu 0xfffe
-; GFX12W32-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GFX12W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX12W32-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX12W32-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a6fa4f..b96de17 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_add_u32 s4, s4, s6
 ; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; CISI-NEXT:    s_or_b32 s6, s12, s13
-; CISI-NEXT:    s_cmp_lg_u32 s6, 0
 ; CISI-NEXT:    s_addc_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
@@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_add_u32 s2, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_addc_u32 s1, s5, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT:    s_addc_u32 s0, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s12, s14
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_addc_u32 s0, s13, s15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_add_u32 s0, s12, s14
+; GFX9-NEXT:    s_addc_u32 s1, s13, s15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-NEXT:    s_add_u32 s0, s12, s14
-; GFX1010-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1010-NEXT:    s_addc_u32 s1, s13, s15
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W32-NEXT:    s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX1030W32-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1030W32-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
@@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W64-NEXT:    s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX1030W64-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1030W64-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s4, s4, s6
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_addc_u32 s5, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
@@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1250-NEXT:    s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
 ; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_sub_u32 s4, s4, s6
 ; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; CISI-NEXT:    s_or_b32 s6, s12, s13
-; CISI-NEXT:    s_cmp_lg_u32 s6, 0
 ; CISI-NEXT:    s_subb_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
@@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_sub_u32 s2, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_sub_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_subb_u32 s1, s5, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT:    s_subb_u32 s0, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s2, s12, s14
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s13, s15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_sub_u32 s0, s12, s14
+; GFX9-NEXT:    s_subb_u32 s1, s13, s15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010-NEXT:    s_sub_u32 s0, s12, s14
-; GFX1010-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1010-NEXT:    s_subb_u32 s1, s13, s15
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W32-NEXT:    s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX1030W32-NEXT:    s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1030W32-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030W64-NEXT:    s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[8:9], 0
 ; GFX1030W64-NEXT:    s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1030W64-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s4, s4, s6
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_subb_u32 s5, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
 ; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    s_addc_u32 s6, s7, s9
 ; VI-NEXT:    s_addc_u32 s8, s8, 0
 ; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    s_add_u32 s12, s6, s7
-; VI-NEXT:    v_mov_b32_e32 v0, s12
+; VI-NEXT:    s_add_u32 s10, s6, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s10
 ; VI-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT:    s_addc_u32 s13, 0, s8
-; VI-NEXT:    s_mul_i32 s8, s4, s13
+; VI-NEXT:    s_addc_u32 s11, 0, s8
+; VI-NEXT:    s_mul_i32 s8, s4, s11
 ; VI-NEXT:    v_readfirstlane_b32 s9, v1
 ; VI-NEXT:    s_add_i32 s8, s9, s8
-; VI-NEXT:    s_mul_i32 s9, s5, s12
-; VI-NEXT:    s_add_i32 s14, s8, s9
-; VI-NEXT:    s_sub_i32 s10, s3, s14
+; VI-NEXT:    s_mul_i32 s9, s5, s10
+; VI-NEXT:    s_add_i32 s12, s8, s9
+; VI-NEXT:    s_sub_i32 s13, s3, s12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-NEXT:    s_sub_u32 s15, s2, s8
+; VI-NEXT:    s_sub_u32 s14, s2, s8
 ; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT:    s_subb_u32 s16, s10, s5
-; VI-NEXT:    s_sub_u32 s17, s15, s4
-; VI-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; VI-NEXT:    s_subb_u32 s10, s16, 0
-; VI-NEXT:    s_cmp_ge_u32 s10, s5
-; VI-NEXT:    s_cselect_b32 s11, -1, 0
-; VI-NEXT:    s_cmp_ge_u32 s17, s4
+; VI-NEXT:    s_subb_u32 s13, s13, s5
+; VI-NEXT:    s_sub_u32 s15, s14, s4
+; VI-NEXT:    s_subb_u32 s13, s13, 0
+; VI-NEXT:    s_cmp_ge_u32 s13, s5
 ; VI-NEXT:    s_cselect_b32 s16, -1, 0
-; VI-NEXT:    s_cmp_eq_u32 s10, s5
-; VI-NEXT:    s_cselect_b32 s10, s16, s11
-; VI-NEXT:    s_add_u32 s11, s12, 1
-; VI-NEXT:    s_addc_u32 s16, s13, 0
-; VI-NEXT:    s_add_u32 s17, s12, 2
-; VI-NEXT:    s_addc_u32 s18, s13, 0
-; VI-NEXT:    s_cmp_lg_u32 s10, 0
-; VI-NEXT:    s_cselect_b32 s10, s17, s11
-; VI-NEXT:    s_cselect_b32 s11, s18, s16
+; VI-NEXT:    s_cmp_ge_u32 s15, s4
+; VI-NEXT:    s_cselect_b32 s15, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s13, s5
+; VI-NEXT:    s_cselect_b32 s13, s15, s16
+; VI-NEXT:    s_add_u32 s15, s10, 1
+; VI-NEXT:    s_addc_u32 s16, s11, 0
+; VI-NEXT:    s_add_u32 s17, s10, 2
+; VI-NEXT:    s_addc_u32 s18, s11, 0
+; VI-NEXT:    s_cmp_lg_u32 s13, 0
+; VI-NEXT:    s_cselect_b32 s13, s17, s15
+; VI-NEXT:    s_cselect_b32 s15, s18, s16
 ; VI-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT:    s_subb_u32 s3, s3, s14
+; VI-NEXT:    s_subb_u32 s3, s3, s12
 ; VI-NEXT:    s_cmp_ge_u32 s3, s5
 ; VI-NEXT:    s_cselect_b32 s8, -1, 0
-; VI-NEXT:    s_cmp_ge_u32 s15, s4
+; VI-NEXT:    s_cmp_ge_u32 s14, s4
 ; VI-NEXT:    s_cselect_b32 s9, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s3, s5
 ; VI-NEXT:    s_cselect_b32 s3, s9, s8
 ; VI-NEXT:    s_cmp_lg_u32 s3, 0
-; VI-NEXT:    s_cselect_b32 s9, s11, s13
-; VI-NEXT:    s_cselect_b32 s8, s10, s12
+; VI-NEXT:    s_cselect_b32 s9, s15, s11
+; VI-NEXT:    s_cselect_b32 s8, s13, s10
 ; VI-NEXT:    s_cbranch_execnz .LBB16_4
 ; VI-NEXT:  .LBB16_2:
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_u32 s10, 0, s6
-; GFX9-NEXT:    s_subb_u32 s11, 0, s7
+; GFX9-NEXT:    s_sub_u32 s8, 0, s6
+; GFX9-NEXT:    s_subb_u32 s9, 0, s7
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s9, s10, s12
-; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s8
-; GFX9-NEXT:    s_mul_i32 s13, s11, s8
-; GFX9-NEXT:    s_add_i32 s9, s14, s9
-; GFX9-NEXT:    s_add_i32 s9, s9, s13
-; GFX9-NEXT:    s_mul_i32 s15, s10, s8
-; GFX9-NEXT:    s_mul_i32 s14, s8, s9
-; GFX9-NEXT:    s_mul_hi_u32 s16, s8, s15
-; GFX9-NEXT:    s_mul_hi_u32 s13, s8, s9
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v0
+; GFX9-NEXT:    s_mul_i32 s12, s8, s10
+; GFX9-NEXT:    s_mul_hi_u32 s14, s8, s11
+; GFX9-NEXT:    s_mul_i32 s13, s9, s11
+; GFX9-NEXT:    s_add_i32 s12, s14, s12
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_mul_i32 s15, s8, s11
+; GFX9-NEXT:    s_mul_i32 s14, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s16, s11, s15
+; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s12
 ; GFX9-NEXT:    s_add_u32 s14, s16, s14
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s13
-; GFX9-NEXT:    s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT:    s_mul_i32 s15, s12, s15
+; GFX9-NEXT:    s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT:    s_mul_i32 s15, s10, s15
 ; GFX9-NEXT:    s_add_u32 s14, s14, s15
-; GFX9-NEXT:    s_mul_hi_u32 s16, s12, s9
+; GFX9-NEXT:    s_mul_hi_u32 s16, s10, s12
 ; GFX9-NEXT:    s_addc_u32 s13, s13, s17
 ; GFX9-NEXT:    s_addc_u32 s14, s16, 0
-; GFX9-NEXT:    s_mul_i32 s9, s12, s9
-; GFX9-NEXT:    s_add_u32 s9, s13, s9
+; GFX9-NEXT:    s_mul_i32 s12, s10, s12
+; GFX9-NEXT:    s_add_u32 s12, s13, s12
 ; GFX9-NEXT:    s_addc_u32 s13, 0, s14
-; GFX9-NEXT:    s_add_u32 s14, s8, s9
-; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_addc_u32 s12, s12, s13
-; GFX9-NEXT:    s_mul_i32 s8, s10, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s14
-; GFX9-NEXT:    s_add_i32 s8, s9, s8
-; GFX9-NEXT:    s_mul_i32 s11, s11, s14
-; GFX9-NEXT:    s_add_i32 s8, s8, s11
-; GFX9-NEXT:    s_mul_i32 s10, s10, s14
-; GFX9-NEXT:    s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT:    s_mul_i32 s13, s12, s10
-; GFX9-NEXT:    s_mul_i32 s16, s14, s8
-; GFX9-NEXT:    s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT:    s_mul_hi_u32 s15, s14, s8
-; GFX9-NEXT:    s_add_u32 s10, s10, s16
+; GFX9-NEXT:    s_add_u32 s11, s11, s12
+; GFX9-NEXT:    s_addc_u32 s10, s10, s13
+; GFX9-NEXT:    s_mul_i32 s12, s8, s10
+; GFX9-NEXT:    s_mul_hi_u32 s13, s8, s11
+; GFX9-NEXT:    s_add_i32 s12, s13, s12
+; GFX9-NEXT:    s_mul_i32 s9, s9, s11
+; GFX9-NEXT:    s_add_i32 s12, s12, s9
+; GFX9-NEXT:    s_mul_i32 s8, s8, s11
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s8
+; GFX9-NEXT:    s_mul_i32 s14, s10, s8
+; GFX9-NEXT:    s_mul_i32 s16, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s8, s11, s8
+; GFX9-NEXT:    s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT:    s_add_u32 s8, s8, s16
 ; GFX9-NEXT:    s_addc_u32 s15, 0, s15
-; GFX9-NEXT:    s_add_u32 s10, s10, s13
-; GFX9-NEXT:    s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT:    s_addc_u32 s10, s15, s11
+; GFX9-NEXT:    s_add_u32 s8, s8, s14
+; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s12
+; GFX9-NEXT:    s_addc_u32 s8, s15, s13
 ; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    s_mul_i32 s8, s12, s8
-; GFX9-NEXT:    s_add_u32 s8, s10, s8
-; GFX9-NEXT:    s_addc_u32 s10, 0, s9
-; GFX9-NEXT:    s_add_u32 s11, s14, s8
-; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_addc_u32 s8, s12, s10
-; GFX9-NEXT:    s_mul_i32 s10, s2, s8
-; GFX9-NEXT:    s_mul_hi_u32 s12, s2, s11
-; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT:    s_add_u32 s10, s12, s10
+; GFX9-NEXT:    s_mul_i32 s12, s10, s12
+; GFX9-NEXT:    s_add_u32 s8, s8, s12
 ; GFX9-NEXT:    s_addc_u32 s9, 0, s9
-; GFX9-NEXT:    s_mul_hi_u32 s13, s3, s11
-; GFX9-NEXT:    s_mul_i32 s11, s3, s11
-; GFX9-NEXT:    s_add_u32 s10, s10, s11
-; GFX9-NEXT:    s_mul_hi_u32 s12, s3, s8
-; GFX9-NEXT:    s_addc_u32 s9, s9, s13
-; GFX9-NEXT:    s_addc_u32 s10, s12, 0
+; GFX9-NEXT:    s_add_u32 s8, s11, s8
+; GFX9-NEXT:    s_addc_u32 s9, s10, s9
+; GFX9-NEXT:    s_mul_i32 s11, s2, s9
+; GFX9-NEXT:    s_mul_hi_u32 s12, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT:    s_add_u32 s11, s12, s11
+; GFX9-NEXT:    s_addc_u32 s10, 0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s13, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s8, s3, s8
-; GFX9-NEXT:    s_add_u32 s12, s9, s8
-; GFX9-NEXT:    s_addc_u32 s13, 0, s10
-; GFX9-NEXT:    s_mul_i32 s8, s6, s13
-; GFX9-NEXT:    s_mul_hi_u32 s9, s6, s12
+; GFX9-NEXT:    s_add_u32 s8, s11, s8
+; GFX9-NEXT:    s_mul_hi_u32 s12, s3, s9
+; GFX9-NEXT:    s_addc_u32 s8, s10, s13
+; GFX9-NEXT:    s_addc_u32 s10, s12, 0
+; GFX9-NEXT:    s_mul_i32 s9, s3, s9
+; GFX9-NEXT:    s_add_u32 s11, s8, s9
+; GFX9-NEXT:    s_addc_u32 s10, 0, s10
+; GFX9-NEXT:    s_mul_i32 s8, s6, s10
+; GFX9-NEXT:    s_mul_hi_u32 s9, s6, s11
 ; GFX9-NEXT:    s_add_i32 s8, s9, s8
-; GFX9-NEXT:    s_mul_i32 s9, s7, s12
-; GFX9-NEXT:    s_add_i32 s14, s8, s9
-; GFX9-NEXT:    s_sub_i32 s10, s3, s14
-; GFX9-NEXT:    s_mul_i32 s8, s6, s12
-; GFX9-NEXT:    s_sub_u32 s15, s2, s8
+; GFX9-NEXT:    s_mul_i32 s9, s7, s11
+; GFX9-NEXT:    s_add_i32 s12, s8, s9
+; GFX9-NEXT:    s_sub_i32 s13, s3, s12
+; GFX9-NEXT:    s_mul_i32 s8, s6, s11
+; GFX9-NEXT:    s_sub_u32 s14, s2, s8
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_subb_u32 s16, s10, s7
-; GFX9-NEXT:    s_sub_u32 s17, s15, s6
-; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT:    s_subb_u32 s10, s16, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s10, s7
-; GFX9-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s17, s6
+; GFX9-NEXT:    s_subb_u32 s13, s13, s7
+; GFX9-NEXT:    s_sub_u32 s15, s14, s6
+; GFX9-NEXT:    s_subb_u32 s13, s13, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s13, s7
 ; GFX9-NEXT:    s_cselect_b32 s16, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s10, s7
-; GFX9-NEXT:    s_cselect_b32 s10, s16, s11
-; GFX9-NEXT:    s_add_u32 s11, s12, 1
-; GFX9-NEXT:    s_addc_u32 s16, s13, 0
-; GFX9-NEXT:    s_add_u32 s17, s12, 2
-; GFX9-NEXT:    s_addc_u32 s18, s13, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s10, s17, s11
-; GFX9-NEXT:    s_cselect_b32 s11, s18, s16
+; GFX9-NEXT:    s_cmp_ge_u32 s15, s6
+; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, s7
+; GFX9-NEXT:    s_cselect_b32 s13, s15, s16
+; GFX9-NEXT:    s_add_u32 s15, s11, 1
+; GFX9-NEXT:    s_addc_u32 s16, s10, 0
+; GFX9-NEXT:    s_add_u32 s17, s11, 2
+; GFX9-NEXT:    s_addc_u32 s18, s10, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_cselect_b32 s13, s17, s15
+; GFX9-NEXT:    s_cselect_b32 s15, s18, s16
 ; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_subb_u32 s3, s3, s14
+; GFX9-NEXT:    s_subb_u32 s3, s3, s12
 ; GFX9-NEXT:    s_cmp_ge_u32 s3, s7
 ; GFX9-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX9-NEXT:    s_cmp_ge_u32 s15, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s14, s6
 ; GFX9-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s3, s7
 ; GFX9-NEXT:    s_cselect_b32 s3, s9, s8
 ; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    s_cselect_b32 s9, s11, s13
-; GFX9-NEXT:    s_cselect_b32 s8, s10, s12
+; GFX9-NEXT:    s_cselect_b32 s9, s15, s10
+; GFX9-NEXT:    s_cselect_b32 s8, s13, s11
 ; GFX9-NEXT:    s_cbranch_execnz .LBB16_3
 ; GFX9-NEXT:  .LBB16_2:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
@@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_add_u32 s11, s12, s11
 ; GFX1010-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX1010-NEXT:    s_add_u32 s8, s8, s11
-; GFX1010-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT:    s_mul_i32 s11, s9, s8
 ; GFX1010-NEXT:    s_addc_u32 s5, s5, s12
-; GFX1010-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1010-NEXT:    s_mul_hi_u32 s11, s9, s8
+; GFX1010-NEXT:    s_mul_i32 s12, s9, s8
 ; GFX1010-NEXT:    s_mul_i32 s9, s9, s5
-; GFX1010-NEXT:    s_mul_hi_u32 s12, s8, s11
-; GFX1010-NEXT:    s_add_i32 s9, s13, s9
-; GFX1010-NEXT:    s_mul_hi_u32 s13, s5, s11
+; GFX1010-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1010-NEXT:    s_add_i32 s9, s11, s9
+; GFX1010-NEXT:    s_mul_i32 s11, s5, s12
 ; GFX1010-NEXT:    s_add_i32 s9, s9, s10
-; GFX1010-NEXT:    s_mul_i32 s10, s5, s11
+; GFX1010-NEXT:    s_mul_hi_u32 s10, s8, s12
 ; GFX1010-NEXT:    s_mul_i32 s15, s8, s9
 ; GFX1010-NEXT:    s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT:    s_add_u32 s12, s12, s15
+; GFX1010-NEXT:    s_add_u32 s10, s10, s15
+; GFX1010-NEXT:    s_mul_hi_u32 s13, s5, s12
 ; GFX1010-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1010-NEXT:    s_mul_hi_u32 s11, s5, s9
-; GFX1010-NEXT:    s_add_u32 s10, s12, s10
+; GFX1010-NEXT:    s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT:    s_add_u32 s10, s10, s11
 ; GFX1010-NEXT:    s_mul_i32 s9, s5, s9
 ; GFX1010-NEXT:    s_addc_u32 s10, s14, s13
-; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1010-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX1010-NEXT:    s_add_u32 s9, s10, s9
 ; GFX1010-NEXT:    s_addc_u32 s10, 0, s11
 ; GFX1010-NEXT:    s_add_u32 s8, s8, s9
-; GFX1010-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1010-NEXT:    s_mul_hi_u32 s11, s2, s8
-; GFX1010-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX1010-NEXT:    s_mul_hi_u32 s9, s3, s8
 ; GFX1010-NEXT:    s_addc_u32 s5, s5, s10
-; GFX1010-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1010-NEXT:    s_mul_hi_u32 s9, s2, s8
 ; GFX1010-NEXT:    s_mul_i32 s12, s2, s5
-; GFX1010-NEXT:    s_mul_hi_u32 s10, s2, s5
-; GFX1010-NEXT:    s_add_u32 s11, s11, s12
-; GFX1010-NEXT:    s_addc_u32 s10, 0, s10
+; GFX1010-NEXT:    s_mul_hi_u32 s11, s2, s5
+; GFX1010-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX1010-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1010-NEXT:    s_add_u32 s9, s9, s12
+; GFX1010-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX1010-NEXT:    s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT:    s_add_u32 s8, s11, s8
+; GFX1010-NEXT:    s_add_u32 s8, s9, s8
 ; GFX1010-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1010-NEXT:    s_addc_u32 s8, s10, s9
+; GFX1010-NEXT:    s_addc_u32 s8, s11, s10
 ; GFX1010-NEXT:    s_addc_u32 s9, s13, 0
 ; GFX1010-NEXT:    s_add_u32 s5, s8, s5
 ; GFX1010-NEXT:    s_addc_u32 s8, 0, s9
@@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX1010-NEXT:    s_sub_u32 s10, s2, s10
 ; GFX1010-NEXT:    s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX1010-NEXT:    s_subb_u32 s11, s11, s7
 ; GFX1010-NEXT:    s_sub_u32 s13, s10, s6
-; GFX1010-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1010-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX1010-NEXT:    s_subb_u32 s11, s11, 0
 ; GFX1010-NEXT:    s_cmp_ge_u32 s11, s7
 ; GFX1010-NEXT:    s_cselect_b32 s14, -1, 0
@@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_add_u32 s11, s12, s11
 ; GFX1030W32-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX1030W32-NEXT:    s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT:    s_mul_i32 s11, s9, s8
 ; GFX1030W32-NEXT:    s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s9, s8
+; GFX1030W32-NEXT:    s_mul_i32 s12, s9, s8
 ; GFX1030W32-NEXT:    s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s8, s11
-; GFX1030W32-NEXT:    s_add_i32 s9, s13, s9
-; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s7, s11
+; GFX1030W32-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT:    s_add_i32 s9, s11, s9
+; GFX1030W32-NEXT:    s_mul_i32 s11, s7, s12
 ; GFX1030W32-NEXT:    s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT:    s_mul_i32 s10, s7, s11
+; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s8, s12
 ; GFX1030W32-NEXT:    s_mul_i32 s15, s8, s9
 ; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT:    s_add_u32 s12, s12, s15
+; GFX1030W32-NEXT:    s_add_u32 s10, s10, s15
+; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s7, s12
 ; GFX1030W32-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s7, s9
-; GFX1030W32-NEXT:    s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT:    s_add_u32 s10, s10, s11
 ; GFX1030W32-NEXT:    s_mul_i32 s9, s7, s9
 ; GFX1030W32-NEXT:    s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1030W32-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX1030W32-NEXT:    s_add_u32 s9, s10, s9
 ; GFX1030W32-NEXT:    s_addc_u32 s10, 0, s11
 ; GFX1030W32-NEXT:    s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s2, s8
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s3, s8
 ; GFX1030W32-NEXT:    s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT:    s_mul_hi_u32 s9, s2, s8
 ; GFX1030W32-NEXT:    s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s2, s7
-; GFX1030W32-NEXT:    s_add_u32 s11, s11, s12
-; GFX1030W32-NEXT:    s_addc_u32 s10, 0, s10
+; GFX1030W32-NEXT:    s_mul_hi_u32 s11, s2, s7
+; GFX1030W32-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX1030W32-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT:    s_add_u32 s9, s9, s12
+; GFX1030W32-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT:    s_add_u32 s8, s11, s8
+; GFX1030W32-NEXT:    s_add_u32 s8, s9, s8
 ; GFX1030W32-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT:    s_addc_u32 s8, s10, s9
+; GFX1030W32-NEXT:    s_addc_u32 s8, s11, s10
 ; GFX1030W32-NEXT:    s_addc_u32 s9, s13, 0
 ; GFX1030W32-NEXT:    s_add_u32 s7, s8, s7
 ; GFX1030W32-NEXT:    s_addc_u32 s8, 0, s9
@@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX1030W32-NEXT:    s_sub_u32 s10, s2, s10
 ; GFX1030W32-NEXT:    s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX1030W32-NEXT:    s_subb_u32 s11, s11, s5
 ; GFX1030W32-NEXT:    s_sub_u32 s13, s10, s4
-; GFX1030W32-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1030W32-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX1030W32-NEXT:    s_subb_u32 s11, s11, 0
 ; GFX1030W32-NEXT:    s_cmp_ge_u32 s11, s5
 ; GFX1030W32-NEXT:    s_cselect_b32 s14, -1, 0
@@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:  ; %bb.1:
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT:    s_sub_u32 s9, 0, s4
-; GFX1030W64-NEXT:    s_subb_u32 s10, 0, s5
+; GFX1030W64-NEXT:    s_sub_u32 s8, 0, s4
+; GFX1030W64-NEXT:    s_subb_u32 s9, 0, s5
 ; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
 ; GFX1030W64-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1030W64-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
 ; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX1030W64-NEXT:    s_mul_i32 s7, s9, s8
-; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s9, s6
-; GFX1030W64-NEXT:    s_mul_i32 s11, s10, s6
-; GFX1030W64-NEXT:    s_add_i32 s7, s12, s7
-; GFX1030W64-NEXT:    s_mul_i32 s13, s9, s6
-; GFX1030W64-NEXT:    s_add_i32 s7, s7, s11
-; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s6, s13
-; GFX1030W64-NEXT:    s_mul_i32 s15, s6, s7
-; GFX1030W64-NEXT:    s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT:    s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s6, s7
+; GFX1030W64-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX1030W64-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX1030W64-NEXT:    s_mul_i32 s10, s8, s6
+; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s8, s7
+; GFX1030W64-NEXT:    s_mul_i32 s11, s9, s7
+; GFX1030W64-NEXT:    s_add_i32 s10, s12, s10
+; GFX1030W64-NEXT:    s_mul_i32 s13, s8, s7
+; GFX1030W64-NEXT:    s_add_i32 s10, s10, s11
+; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s7, s13
+; GFX1030W64-NEXT:    s_mul_i32 s15, s7, s10
+; GFX1030W64-NEXT:    s_mul_hi_u32 s14, s6, s13
+; GFX1030W64-NEXT:    s_mul_i32 s11, s6, s13
+; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s7, s10
 ; GFX1030W64-NEXT:    s_add_u32 s12, s12, s15
 ; GFX1030W64-NEXT:    s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT:    s_mul_hi_u32 s16, s8, s7
+; GFX1030W64-NEXT:    s_mul_hi_u32 s16, s6, s10
 ; GFX1030W64-NEXT:    s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT:    s_mul_i32 s7, s8, s7
+; GFX1030W64-NEXT:    s_mul_i32 s10, s6, s10
 ; GFX1030W64-NEXT:    s_addc_u32 s11, s13, s14
 ; GFX1030W64-NEXT:    s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT:    s_add_u32 s7, s11, s7
+; GFX1030W64-NEXT:    s_add_u32 s10, s11, s10
 ; GFX1030W64-NEXT:    s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT:    s_add_u32 s12, s6, s7
-; GFX1030W64-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s9, s12
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT:    s_mul_i32 s6, s9, s12
-; GFX1030W64-NEXT:    s_addc_u32 s8, s8, s11
-; GFX1030W64-NEXT:    s_mul_i32 s10, s10, s12
-; GFX1030W64-NEXT:    s_mul_i32 s9, s9, s8
-; GFX1030W64-NEXT:    s_mul_hi_u32 s7, s12, s6
-; GFX1030W64-NEXT:    s_add_i32 s9, s13, s9
-; GFX1030W64-NEXT:    s_mul_hi_u32 s11, s8, s6
-; GFX1030W64-NEXT:    s_add_i32 s9, s9, s10
-; GFX1030W64-NEXT:    s_mul_i32 s6, s8, s6
-; GFX1030W64-NEXT:    s_mul_i32 s14, s12, s9
-; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s12, s9
-; GFX1030W64-NEXT:    s_add_u32 s7, s7, s14
+; GFX1030W64-NEXT:    s_add_u32 s7, s7, s10
+; GFX1030W64-NEXT:    s_addc_u32 s6, s6, s11
+; GFX1030W64-NEXT:    s_mul_hi_u32 s10, s8, s7
+; GFX1030W64-NEXT:    s_mul_i32 s11, s8, s7
+; GFX1030W64-NEXT:    s_mul_i32 s8, s8, s6
+; GFX1030W64-NEXT:    s_mul_i32 s9, s9, s7
+; GFX1030W64-NEXT:    s_add_i32 s8, s10, s8
+; GFX1030W64-NEXT:    s_mul_i32 s10, s6, s11
+; GFX1030W64-NEXT:    s_add_i32 s8, s8, s9
+; GFX1030W64-NEXT:    s_mul_hi_u32 s9, s7, s11
+; GFX1030W64-NEXT:    s_mul_i32 s14, s7, s8
+; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s7, s8
+; GFX1030W64-NEXT:    s_add_u32 s9, s9, s14
+; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s6, s11
 ; GFX1030W64-NEXT:    s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT:    s_mul_hi_u32 s10, s8, s9
-; GFX1030W64-NEXT:    s_add_u32 s6, s7, s6
-; GFX1030W64-NEXT:    s_mul_i32 s9, s8, s9
-; GFX1030W64-NEXT:    s_addc_u32 s6, s13, s11
-; GFX1030W64-NEXT:    s_addc_u32 s7, s10, 0
-; GFX1030W64-NEXT:    s_add_u32 s6, s6, s9
-; GFX1030W64-NEXT:    s_addc_u32 s9, 0, s7
-; GFX1030W64-NEXT:    s_add_u32 s10, s12, s6
-; GFX1030W64-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT:    s_mul_hi_u32 s11, s2, s10
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT:    s_mul_hi_u32 s6, s3, s10
-; GFX1030W64-NEXT:    s_addc_u32 s7, s8, s9
-; GFX1030W64-NEXT:    s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT:    s_mul_i32 s10, s2, s7
-; GFX1030W64-NEXT:    s_mul_hi_u32 s9, s2, s7
-; GFX1030W64-NEXT:    s_add_u32 s10, s11, s10
-; GFX1030W64-NEXT:    s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s3, s7
-; GFX1030W64-NEXT:    s_add_u32 s8, s10, s8
+; GFX1030W64-NEXT:    s_mul_hi_u32 s11, s6, s8
+; GFX1030W64-NEXT:    s_add_u32 s9, s9, s10
+; GFX1030W64-NEXT:    s_mul_i32 s8, s6, s8
+; GFX1030W64-NEXT:    s_addc_u32 s9, s13, s12
+; GFX1030W64-NEXT:    s_addc_u32 s10, s11, 0
+; GFX1030W64-NEXT:    s_add_u32 s8, s9, s8
+; GFX1030W64-NEXT:    s_addc_u32 s9, 0, s10
+; GFX1030W64-NEXT:    s_add_u32 s7, s7, s8
+; GFX1030W64-NEXT:    s_addc_u32 s6, s6, s9
+; GFX1030W64-NEXT:    s_mul_hi_u32 s8, s2, s7
+; GFX1030W64-NEXT:    s_mul_i32 s11, s2, s6
+; GFX1030W64-NEXT:    s_mul_hi_u32 s10, s2, s6
+; GFX1030W64-NEXT:    s_mul_hi_u32 s9, s3, s7
 ; GFX1030W64-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT:    s_addc_u32 s6, s9, s6
+; GFX1030W64-NEXT:    s_add_u32 s8, s8, s11
+; GFX1030W64-NEXT:    s_addc_u32 s10, 0, s10
+; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s3, s6
+; GFX1030W64-NEXT:    s_add_u32 s7, s8, s7
+; GFX1030W64-NEXT:    s_mul_i32 s6, s3, s6
+; GFX1030W64-NEXT:    s_addc_u32 s7, s10, s9
 ; GFX1030W64-NEXT:    s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT:    s_add_u32 s10, s6, s7
+; GFX1030W64-NEXT:    s_add_u32 s10, s7, s6
 ; GFX1030W64-NEXT:    s_addc_u32 s11, 0, s8
 ; GFX1030W64-NEXT:    s_mul_hi_u32 s6, s4, s10
 ; GFX1030W64-NEXT:    s_mul_i32 s7, s4, s11
 ; GFX1030W64-NEXT:    s_mul_i32 s8, s5, s10
 ; GFX1030W64-NEXT:    s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT:    s_add_i32 s12, s6, s8
+; GFX1030W64-NEXT:    s_add_i32 s8, s6, s8
 ; GFX1030W64-NEXT:    s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT:    s_sub_i32 s8, s3, s12
-; GFX1030W64-NEXT:    s_sub_u32 s13, s2, s6
+; GFX1030W64-NEXT:    s_sub_i32 s9, s3, s8
+; GFX1030W64-NEXT:    s_sub_u32 s12, s2, s6
 ; GFX1030W64-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT:    s_subb_u32 s14, s8, s5
-; GFX1030W64-NEXT:    s_sub_u32 s15, s13, s4
-; GFX1030W64-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX1030W64-NEXT:    s_subb_u32 s8, s14, 0
-; GFX1030W64-NEXT:    s_cmp_ge_u32 s8, s5
-; GFX1030W64-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT:    s_cmp_ge_u32 s15, s4
+; GFX1030W64-NEXT:    s_subb_u32 s9, s9, s5
+; GFX1030W64-NEXT:    s_sub_u32 s13, s12, s4
+; GFX1030W64-NEXT:    s_subb_u32 s9, s9, 0
+; GFX1030W64-NEXT:    s_cmp_ge_u32 s9, s5
 ; GFX1030W64-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT:    s_cmp_eq_u32 s8, s5
-; GFX1030W64-NEXT:    s_cselect_b32 s8, s14, s9
-; GFX1030W64-NEXT:    s_add_u32 s9, s10, 1
+; GFX1030W64-NEXT:    s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX1030W64-NEXT:    s_cmp_eq_u32 s9, s5
+; GFX1030W64-NEXT:    s_cselect_b32 s9, s13, s14
+; GFX1030W64-NEXT:    s_add_u32 s13, s10, 1
 ; GFX1030W64-NEXT:    s_addc_u32 s14, s11, 0
 ; GFX1030W64-NEXT:    s_add_u32 s15, s10, 2
 ; GFX1030W64-NEXT:    s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX1030W64-NEXT:    s_cselect_b32 s15, s15, s9
+; GFX1030W64-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX1030W64-NEXT:    s_cselect_b32 s13, s15, s13
 ; GFX1030W64-NEXT:    s_cselect_b32 s14, s16, s14
 ; GFX1030W64-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT:    s_subb_u32 s3, s3, s12
+; GFX1030W64-NEXT:    s_subb_u32 s3, s3, s8
 ; GFX1030W64-NEXT:    s_cmp_ge_u32 s3, s5
 ; GFX1030W64-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT:    s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT:    s_cmp_ge_u32 s12, s4
 ; GFX1030W64-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX1030W64-NEXT:    s_cmp_eq_u32 s3, s5
 ; GFX1030W64-NEXT:    s_cselect_b32 s3, s7, s6
 ; GFX1030W64-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1030W64-NEXT:    s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT:    s_cselect_b32 s6, s15, s10
+; GFX1030W64-NEXT:    s_cselect_b32 s6, s13, s10
 ; GFX1030W64-NEXT:    s_cbranch_execnz .LBB16_3
 ; GFX1030W64-NEXT:  .LBB16_2:
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s4
@@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_add_u32 s11, s12, s11
 ; GFX11-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX11-NEXT:    s_add_u32 s8, s8, s11
-; GFX11-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX11-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX11-NEXT:    s_mul_i32 s11, s9, s8
 ; GFX11-NEXT:    s_addc_u32 s7, s7, s12
-; GFX11-NEXT:    s_mul_i32 s10, s10, s8
+; GFX11-NEXT:    s_mul_hi_u32 s11, s9, s8
+; GFX11-NEXT:    s_mul_i32 s12, s9, s8
 ; GFX11-NEXT:    s_mul_i32 s9, s9, s7
-; GFX11-NEXT:    s_mul_hi_u32 s12, s8, s11
-; GFX11-NEXT:    s_add_i32 s9, s13, s9
-; GFX11-NEXT:    s_mul_hi_u32 s13, s7, s11
+; GFX11-NEXT:    s_mul_i32 s10, s10, s8
+; GFX11-NEXT:    s_add_i32 s9, s11, s9
+; GFX11-NEXT:    s_mul_i32 s11, s7, s12
 ; GFX11-NEXT:    s_add_i32 s9, s9, s10
-; GFX11-NEXT:    s_mul_i32 s10, s7, s11
+; GFX11-NEXT:    s_mul_hi_u32 s10, s8, s12
 ; GFX11-NEXT:    s_mul_i32 s15, s8, s9
 ; GFX11-NEXT:    s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT:    s_add_u32 s12, s12, s15
+; GFX11-NEXT:    s_add_u32 s10, s10, s15
+; GFX11-NEXT:    s_mul_hi_u32 s13, s7, s12
 ; GFX11-NEXT:    s_addc_u32 s14, 0, s14
-; GFX11-NEXT:    s_mul_hi_u32 s11, s7, s9
-; GFX11-NEXT:    s_add_u32 s10, s12, s10
+; GFX11-NEXT:    s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT:    s_add_u32 s10, s10, s11
 ; GFX11-NEXT:    s_mul_i32 s9, s7, s9
 ; GFX11-NEXT:    s_addc_u32 s10, s14, s13
-; GFX11-NEXT:    s_addc_u32 s11, s11, 0
+; GFX11-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX11-NEXT:    s_add_u32 s9, s10, s9
 ; GFX11-NEXT:    s_addc_u32 s10, 0, s11
 ; GFX11-NEXT:    s_add_u32 s8, s8, s9
-; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX11-NEXT:    s_mul_hi_u32 s11, s2, s8
-; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX11-NEXT:    s_mul_hi_u32 s9, s3, s8
 ; GFX11-NEXT:    s_addc_u32 s7, s7, s10
-; GFX11-NEXT:    s_mul_i32 s8, s3, s8
+; GFX11-NEXT:    s_mul_hi_u32 s9, s2, s8
 ; GFX11-NEXT:    s_mul_i32 s12, s2, s7
-; GFX11-NEXT:    s_mul_hi_u32 s10, s2, s7
-; GFX11-NEXT:    s_add_u32 s11, s11, s12
-; GFX11-NEXT:    s_addc_u32 s10, 0, s10
+; GFX11-NEXT:    s_mul_hi_u32 s11, s2, s7
+; GFX11-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX11-NEXT:    s_mul_i32 s8, s3, s8
+; GFX11-NEXT:    s_add_u32 s9, s9, s12
+; GFX11-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX11-NEXT:    s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT:    s_add_u32 s8, s11, s8
+; GFX11-NEXT:    s_add_u32 s8, s9, s8
 ; GFX11-NEXT:    s_mul_i32 s7, s3, s7
-; GFX11-NEXT:    s_addc_u32 s8, s10, s9
+; GFX11-NEXT:    s_addc_u32 s8, s11, s10
 ; GFX11-NEXT:    s_addc_u32 s9, s13, 0
 ; GFX11-NEXT:    s_add_u32 s7, s8, s7
 ; GFX11-NEXT:    s_addc_u32 s8, 0, s9
@@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_add_i32 s9, s9, s10
 ; GFX11-NEXT:    s_mul_i32 s10, s4, s7
 ; GFX11-NEXT:    s_add_i32 s9, s9, s11
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX11-NEXT:    s_sub_u32 s10, s2, s10
 ; GFX11-NEXT:    s_cselect_b32 s12, -1, 0
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    s_subb_u32 s11, s11, s5
 ; GFX11-NEXT:    s_sub_u32 s13, s10, s4
-; GFX11-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_subb_u32 s11, s11, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_ge_u32 s11, s5
 ; GFX11-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX11-NEXT:    s_cmp_ge_u32 s13, s4
@@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
-; GFX1250-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX1250-NEXT:    s_cbranch_scc0 .LBB16_4
 ; GFX1250-NEXT:  ; %bb.1:
 ; GFX1250-NEXT:    s_cvt_f32_u32 s4, s6
@@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[4:5], s[12:13]
 ; GFX1250-NEXT:    s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX1250-NEXT:    s_add_co_ci_u32 s9, s9, s13
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_u64 s[10:11], s[10:11], s[8:9]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_hi_u32 s13, s8, s11
 ; GFX1250-NEXT:    s_mul_i32 s12, s8, s11
 ; GFX1250-NEXT:    s_mul_hi_u32 s4, s8, s10
@@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[10:11], s[4:5], s[10:11]
 ; GFX1250-NEXT:    s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT:    s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT:    s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX1250-NEXT:    s_mul_hi_u32 s12, s3, s8
 ; GFX1250-NEXT:    s_add_co_ci_u32 s10, s9, s11
-; GFX1250-NEXT:    s_mul_i32 s11, s3, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s4, s2, s8
+; GFX1250-NEXT:    s_mul_hi_u32 s11, s3, s8
+; GFX1250-NEXT:    s_mul_i32 s12, s3, s8
 ; GFX1250-NEXT:    s_mul_hi_u32 s9, s2, s10
 ; GFX1250-NEXT:    s_mul_i32 s8, s2, s10
 ; GFX1250-NEXT:    s_mul_hi_u32 s13, s3, s10
 ; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[4:5], s[8:9]
 ; GFX1250-NEXT:    s_mul_i32 s10, s3, s10
-; GFX1250-NEXT:    s_add_co_u32 s4, s8, s11
-; GFX1250-NEXT:    s_add_co_ci_u32 s4, s9, s12
+; GFX1250-NEXT:    s_add_co_u32 s4, s8, s12
+; GFX1250-NEXT:    s_add_co_ci_u32 s4, s9, s11
 ; GFX1250-NEXT:    s_add_co_ci_u32 s11, s13, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s12, s12, s7
 ; GFX1250-NEXT:    s_sub_co_u32 s13, s4, s6
-; GFX1250-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s12, s12, 0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_cmp_ge_u32 s12, s7
 ; GFX1250-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX1250-NEXT:    s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4b151b9..07e6a76 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    s_lshl_b32 s2, s2, 8
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    s_lshl_b32 s3, s2, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_flbit_i32_b32 s3, s3
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_cselect_b32 s2, s3, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index cefcbdd..fca57be 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB14_4
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
@@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    s_cbranch_scc0 .LBB14_4
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d8a5e7fa..dbdea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    s_add_u32 s7, s6, s6
 ; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
-; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    s_addc_u32 s8, s6, 0
 ; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s7, s6, s6
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_addc_u32 s8, s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
 ; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s5, s4, s4
-; GFX10-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX10-NEXT:    s_and_b32 s7, s7, exec_lo
@@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s1, s0, s0
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s3, s3, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX11-NEXT:    s_cmp_gt_u32 s0, 31
 ; GFX11-NEXT:    s_cselect_b32 s0, s1, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-NEXT:    s_add_u32 s0, s2, s2
 ; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX7-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
@@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ;
 ; GFX9-LABEL: s_add_co_br_user:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s2, s2
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_addc_u32 s0, s2, 0
+; GFX9-NEXT:    s_add_u32 s1, s0, s0
+; GFX9-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB1_2
@@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s1, s0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s0
@@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s1, s0, s0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index 13206ad..f45070c 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
 
-; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee  -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
-; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee  < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
+; RUN: llc -mtriple=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
 
 ; FIXME: This should also fold when fma is actually fast if an FMA
 ; exists in the original program.
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 62847b1..9a17538 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_and_b32 s3, s1, 0x1ff
 ; SI-NEXT:    s_or_b32 s0, s3, s0
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; SI-NEXT:    s_lshr_b32 s0, s1, 8
@@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_and_b32 s3, s1, 0x1ff
 ; VI-NEXT:    s_or_b32 s0, s3, s0
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; VI-NEXT:    s_lshr_b32 s0, s1, 8
@@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s3, s1, 0x1ff
 ; GFX9-NEXT:    s_or_b32 s0, s3, s0
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
@@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-TRUE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
 ; GFX11-TRUE16-NEXT:    s_lshr_b32 s1, s1, 8
@@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-FAKE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
 ; GFX11-FAKE16-NEXT:    s_lshr_b32 s1, s1, 8
@@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; SI-NEXT:    s_and_b32 s6, s4, 0xffe
 ; SI-NEXT:    s_and_b32 s4, s1, 0x1ff
 ; SI-NEXT:    s_or_b32 s0, s4, s0
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s5
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; SI-NEXT:    s_and_b32 s5, s0, 0xffe
 ; SI-NEXT:    s_and_b32 s0, s3, 0x1ff
 ; SI-NEXT:    s_or_b32 s0, s0, s2
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SI-NEXT:    v_readfirstlane_b32 s0, v2
@@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_lshr_b32 s5, s3, 8
-; VI-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffe
+; VI-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; VI-NEXT:    s_or_b32 s2, s6, s2
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
 ; VI-NEXT:    s_bfe_u32 s3, s3, 0xb0014
@@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    s_and_b32 s7, s2, 0xffe
 ; VI-NEXT:    s_and_b32 s2, s1, 0x1ff
 ; VI-NEXT:    s_or_b32 s0, s2, s0
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; VI-NEXT:    s_bfe_u32 s1, s1, 0xb0014
@@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX9-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX9-NEXT:    s_and_b32 s5, s5, 0xffe
+; GFX9-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX9-NEXT:    s_or_b32 s2, s6, s2
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
 ; GFX9-NEXT:    s_bfe_u32 s6, s3, 0xb0014
@@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-NEXT:    s_and_b32 s6, s2, 0xffe
 ; GFX9-NEXT:    s_and_b32 s2, s1, 0x1ff
 ; GFX9-NEXT:    s_or_b32 s0, s2, s0
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ;
 ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT:    s_lshr_b32 s6, s3, 8
-; GFX11-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-NEXT:    s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-NEXT:    s_lshr_b32 s5, s3, 8
+; GFX11-NEXT:    s_and_b32 s6, s3, 0x1ff
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffe
+; GFX11-NEXT:    s_or_b32 s2, s6, s2
 ; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX11-NEXT:    s_cmpk_eq_i32 s2, 0x40f
 ; GFX11-NEXT:    s_cselect_b32 s2, s5, s6
 ; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-NEXT:    s_and_b32 s6, s1, 0x1ff
 ; GFX11-NEXT:    s_lshr_b32 s5, s1, 8
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT:    s_or_b32 s0, s6, s0
+; GFX11-NEXT:    s_and_b32 s6, s1, 0x1ff
 ; GFX11-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX11-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_or_b32 s0, s6, s0
 ; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index d41e2c6..8df7564 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
 
 define amdgpu_kernel void @fpext_f16_to_f32(
 ; SI-LABEL: fpext_f16_to_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index a43292d..a043d53 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
 
 
 define amdgpu_kernel void @fptosi_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 96cb621..af1ab37 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
 
 
 define amdgpu_kernel void @fptoui_f16_to_i16(
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index b0dd187..c28b25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; SI-GISEL-NEXT:    s_addk_i32 s3, 0xfc10
 ; SI-GISEL-NEXT:    s_and_b32 s6, s6, 0xffe
 ; SI-GISEL-NEXT:    s_or_b32 s4, s7, s4
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-GISEL-NEXT:    s_or_b32 s4, s6, s4
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
 ; SI-GISEL-NEXT:    s_lshl_b32 s6, s6, 9
 ; SI-GISEL-NEXT:    s_lshl_b32 s7, s3, 12
@@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; VI-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; VI-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; VI-GISEL-NEXT:    s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; VI-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
@@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX9-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX9-GISEL-NEXT:    s_or_b32 s2, s6, s2
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
@@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX950-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX950-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX950-GISEL-NEXT:    s_or_b32 s2, s6, s2
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX950-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX950-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; GFX950-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
@@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX11-GISEL-TRUE16-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s2, s6, s2
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s8, s2, 0x1000
@@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX11-GISEL-FAKE16-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s2, s6, s2
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s8, s2, 0x1000
@@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX1250-GISEL-TRUE16-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_addk_co_i32 s4, 0xfc10
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s2, s6, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s2, s5, s2
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_sub_co_i32 s6, 1, s4
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s8, s2, 0x1000
@@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX1250-GISEL-FAKE16-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_addk_co_i32 s4, 0xfc10
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s2, s6, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s2, s5, s2
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_sub_co_i32 s6, 1, s4
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s8, s2, 0x1000
@@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; SI-GISEL-NEXT:    s_addk_i32 s3, 0xfc10
 ; SI-GISEL-NEXT:    s_and_b32 s8, s8, 0xffe
 ; SI-GISEL-NEXT:    s_or_b32 s4, s9, s4
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-GISEL-NEXT:    s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-GISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; SI-GISEL-NEXT:    s_lshl_b32 s8, s8, 9
 ; SI-GISEL-NEXT:    s_lshl_b32 s9, s3, 12
@@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; SI-GISEL-NEXT:    s_and_b32 s4, s4, 0x8000
 ; SI-GISEL-NEXT:    s_addk_i32 s5, 0xfc10
 ; SI-GISEL-NEXT:    s_and_b32 s8, s8, 0xffe
-; SI-GISEL-NEXT:    s_or_b32 s6, s9, s6
 ; SI-GISEL-NEXT:    s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT:    s_or_b32 s4, s9, s6
 ; SI-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-GISEL-NEXT:    s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
 ; SI-GISEL-NEXT:    s_lshl_b32 s6, s6, 9
 ; SI-GISEL-NEXT:    s_lshl_b32 s8, s5, 12
@@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; VI-GISEL-NEXT:    s_addk_i32 s2, 0xfc10
 ; VI-GISEL-NEXT:    s_and_b32 s3, s3, 0xffe
 ; VI-GISEL-NEXT:    s_or_b32 s4, s8, s4
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; VI-GISEL-NEXT:    s_or_b32 s3, s3, s4
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; VI-GISEL-NEXT:    s_sub_i32 s9, 1, s2
 ; VI-GISEL-NEXT:    s_lshl_b32 s8, s2, 12
@@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; VI-GISEL-NEXT:    s_or_b32 s2, s3, s2
 ; VI-GISEL-NEXT:    s_bfe_u32 s3, s7, 0xb0014
 ; VI-GISEL-NEXT:    s_lshr_b32 s4, s7, 8
-; VI-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; VI-GISEL-NEXT:    s_addk_i32 s3, 0xfc10
 ; VI-GISEL-NEXT:    s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; VI-GISEL-NEXT:    s_or_b32 s5, s5, s6
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-GISEL-NEXT:    s_or_b32 s4, s4, s5
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-GISEL-NEXT:    s_sub_i32 s8, 1, s3
 ; VI-GISEL-NEXT:    s_lshl_b32 s6, s3, 12
@@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_addk_i32 s2, 0xfc10
 ; GFX9-GISEL-NEXT:    s_and_b32 s3, s3, 0xffe
 ; GFX9-GISEL-NEXT:    s_or_b32 s4, s8, s4
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-GISEL-NEXT:    s_or_b32 s3, s3, s4
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-GISEL-NEXT:    s_sub_i32 s9, 1, s2
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s8, s2, 12
@@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_or_b32 s2, s3, s2
 ; GFX9-GISEL-NEXT:    s_bfe_u32 s3, s7, 0xb0014
 ; GFX9-GISEL-NEXT:    s_lshr_b32 s4, s7, 8
-; GFX9-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; GFX9-GISEL-NEXT:    s_addk_i32 s3, 0xfc10
 ; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; GFX9-GISEL-NEXT:    s_or_b32 s5, s5, s6
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-GISEL-NEXT:    s_or_b32 s4, s4, s5
-; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-GISEL-NEXT:    s_sub_i32 s8, 1, s3
 ; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s3, 12
@@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX950-GISEL-NEXT:    s_addk_i32 s2, 0xfc10
 ; GFX950-GISEL-NEXT:    s_and_b32 s3, s3, 0xffe
 ; GFX950-GISEL-NEXT:    s_or_b32 s4, s8, s4
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX950-GISEL-NEXT:    s_or_b32 s3, s3, s4
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX950-GISEL-NEXT:    s_sub_i32 s9, 1, s2
 ; GFX950-GISEL-NEXT:    s_lshl_b32 s8, s2, 12
@@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX950-GISEL-NEXT:    s_or_b32 s2, s3, s2
 ; GFX950-GISEL-NEXT:    s_bfe_u32 s3, s7, 0xb0014
 ; GFX950-GISEL-NEXT:    s_lshr_b32 s4, s7, 8
-; GFX950-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; GFX950-GISEL-NEXT:    s_addk_i32 s3, 0xfc10
 ; GFX950-GISEL-NEXT:    s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT:    s_and_b32 s5, s7, 0x1ff
 ; GFX950-GISEL-NEXT:    s_or_b32 s5, s5, s6
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX950-GISEL-NEXT:    s_or_b32 s4, s4, s5
-; GFX950-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX950-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX950-GISEL-NEXT:    s_sub_i32 s8, 1, s3
 ; GFX950-GISEL-NEXT:    s_lshl_b32 s6, s3, 12
@@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_bfe_u32 s2, s5, 0xb0014
 ; GFX11-GISEL-TRUE16-NEXT:    s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_addk_i32 s2, 0xfc10
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, s8, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_sub_i32 s8, 1, s2
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s10, s3, 0x1000
@@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-TRUE16-NEXT:    s_cmpk_eq_i32 s2, 0x40f
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, s4, s3
 ; GFX11-GISEL-TRUE16-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_bfe_u32 s4, s7, 0xb0014
 ; GFX11-GISEL-TRUE16-NEXT:    s_lshr_b32 s5, s7, 8
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX11-GISEL-TRUE16-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s3, s8, s6
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s3, s5, s3
-; GFX11-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s9, s3, 0x1000
@@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_bfe_u32 s2, s5, 0xb0014
 ; GFX11-GISEL-FAKE16-NEXT:    s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_addk_i32 s2, 0xfc10
 ; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, s8, s4
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_sub_i32 s8, 1, s2
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s10, s3, 0x1000
@@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-FAKE16-NEXT:    s_cmpk_eq_i32 s2, 0x40f
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, s4, s3
 ; GFX11-GISEL-FAKE16-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_bfe_u32 s4, s7, 0xb0014
 ; GFX11-GISEL-FAKE16-NEXT:    s_lshr_b32 s5, s7, 8
 ; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX11-GISEL-FAKE16-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s3, s8, s6
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s3, s5, s3
-; GFX11-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-GISEL-FAKE16-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s9, s3, 0x1000
@@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bfe_u32 s2, s5, 0xb0014
 ; GFX1250-GISEL-TRUE16-NEXT:    s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_addk_co_i32 s2, 0xfc10
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s4, s8, s4
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_sub_co_i32 s8, 1, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s10, s3, 0x1000
@@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_eq_u32 s2, 0x40f
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, s4, s3
 ; GFX1250-GISEL-TRUE16-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bfe_u32 s4, s7, 0xb0014
 ; GFX1250-GISEL-TRUE16-NEXT:    s_lshr_b32 s5, s7, 8
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s6, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX1250-GISEL-TRUE16-NEXT:    s_addk_co_i32 s4, 0xfc10
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s3, s8, s6
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s3, s5, s3
-; GFX1250-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_sub_co_i32 s6, 1, s4
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s9, s3, 0x1000
@@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bfe_u32 s2, s5, 0xb0014
 ; GFX1250-GISEL-FAKE16-NEXT:    s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s8, s5, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_addk_co_i32 s2, 0xfc10
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s4, s8, s4
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_sub_co_i32 s8, 1, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s10, s3, 0x1000
@@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_eq_u32 s2, 0x40f
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, s4, s3
 ; GFX1250-GISEL-FAKE16-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bfe_u32 s4, s7, 0xb0014
 ; GFX1250-GISEL-FAKE16-NEXT:    s_lshr_b32 s5, s7, 8
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s6, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s8, s7, 0x1ff
 ; GFX1250-GISEL-FAKE16-NEXT:    s_addk_co_i32 s4, 0xfc10
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s3, s8, s6
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s3, s5, s3
-; GFX1250-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_sub_co_i32 s6, 1, s4
 ; GFX1250-GISEL-FAKE16-NEXT:    s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 5d31177..b6b26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -2,14 +2,14 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
 
@@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; SI-NEXT:    s_and_b32 s1, s7, 0x1ff
 ; SI-NEXT:    s_and_b32 s8, s0, 0xffe
 ; SI-NEXT:    s_or_b32 s0, s1, s6
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
@@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SDAG-NEXT:    s_and_b32 s8, s4, 0xffe
 ; VI-SDAG-NEXT:    s_and_b32 s4, s7, 0x1ff
 ; VI-SDAG-NEXT:    s_or_b32 s4, s4, s6
-; VI-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s1, s5
 ; VI-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; VI-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; VI-GISEL-NEXT:    s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; VI-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
@@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT:    s_and_b32 s4, s3, 0x1ff
-; GFX10-SDAG-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX10-SDAG-NEXT:    s_or_b32 s2, s4, s2
-; GFX10-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
-; GFX10-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-SDAG-NEXT:    s_lshr_b32 s4, s3, 8
+; GFX10-SDAG-NEXT:    s_and_b32 s5, s3, 0x1ff
+; GFX10-SDAG-NEXT:    s_and_b32 s4, s4, 0xffe
+; GFX10-SDAG-NEXT:    s_or_b32 s2, s5, s2
 ; GFX10-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX10-SDAG-NEXT:    s_bfe_u32 s2, s3, 0xb0014
@@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX10-GISEL-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX10-GISEL-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT:    s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX10-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX10-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT:    s_or_b32 s2, s6, s2
 ; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-GISEL-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX10-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
@@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_and_b32 s4, s3, 0x1ff
-; GFX11-SDAG-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX11-SDAG-NEXT:    s_or_b32 s2, s4, s2
-; GFX11-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
-; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_lshr_b32 s4, s3, 8
+; GFX11-SDAG-NEXT:    s_and_b32 s5, s3, 0x1ff
+; GFX11-SDAG-NEXT:    s_and_b32 s4, s4, 0xffe
+; GFX11-SDAG-NEXT:    s_or_b32 s2, s5, s2
 ; GFX11-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
@@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-NEXT:    s_bfe_u32 s4, s3, 0xb0014
 ; GFX11-GISEL-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
 ; GFX11-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
 ; GFX11-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_or_b32 s2, s6, s2
 ; GFX11-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-GISEL-NEXT:    s_sub_i32 s6, 1, s4
 ; GFX11-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
index f09c1c6..cc2e78d 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll
@@ -2,8 +2,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
 
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s
 
 declare double @llvm.fabs.f64(double) #0
 declare double @llvm.floor.f64(double) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll
index 8ef0fcf..723fd93 100644
--- a/llvm/test/CodeGen/AMDGPU/fract.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract.ll
@@ -1,8 +1,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck --check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefix=GCN %s
 
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.floor.f32(float) #0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d1..31f277f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v1, s3, v1
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v1, s4, v1
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v1, s3, v1
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v1, s4, v1
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v1, s3, v1
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v1, s4, v1
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v1, s3, v1
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v1, s4, v1
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v1, s2, v1
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX9-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX9-NEXT:    v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT:    v_max_f32_e64 v2, s4, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v1
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
 ; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB1_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX9-NEXT:    v_readlane_b32 s4, v0, s2
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    v_add_f32_e32 v2, s3, v2
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_add_f32_e32 v2, s4, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    v_add_f32_e32 v2, s2, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB12_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
 ; GFX1164-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX1132-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 4ae0ba0..4e93eca 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-unsafe-fp-math %s  | FileCheck --check-prefixes=GCN,UNSAFE %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 %s  | FileCheck --check-prefixes=GCN,UNSAFE %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-nans-fp-math %s | FileCheck --check-prefixes=GCN,NONANS %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck --check-prefixes=GCN,NOINFS %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eee232a..c3f3917 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:  .LBB2_6: ; %bb18
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT:    s_and_b32 s1, s8, s1
-; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT:    s_and_b32 s13, s8, s13
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s13, s13, exec_lo
 ; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
-; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s19, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s1, s1, 1
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
 ; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
 ; GFX11-NEXT:    s_and_b32 s20, s9, exec_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 8748aff..6dc9199 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readlane_b32 s6, v1, s3
-; GFX12-NEXT:    s_lshl_b32 s7, 1, s3
 ; GFX12-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX12-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_and_not1_b32 s1, s1, s3
 ; GFX12-NEXT:    s_add_f32 s0, s0, s6
 ; GFX12-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX12-NEXT:  ; %bb.6: ; %ComputeEnd
@@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX942-NEXT:  .LBB28_5: ; %ComputeLoop
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX942-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX942-NEXT:    s_mov_b32 m0, s3
+; GFX942-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX942-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX942-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX11-NEXT:  .LBB28_5: ; %ComputeLoop
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX11-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 s0, s0, s7
 ; GFX11-NEXT:    v_writelane_b32 v0, s3, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX11-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX11-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX10-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX10-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX10-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s7
 ; GFX10-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX10-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX10-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX90A-NEXT:  .LBB28_5: ; %ComputeLoop
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX90A-NEXT:    s_mov_b32 m0, s3
+; GFX90A-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX90A-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX90A-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX908-NEXT:  .LBB28_5: ; %ComputeLoop
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX908-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX908-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX908-NEXT:    s_mov_b32 m0, s3
+; GFX908-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX908-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX908-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX908-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX908-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX8-NEXT:  .LBB28_5: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX8-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX8-NEXT:    s_mov_b32 m0, s3
+; GFX8-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX8-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB28_5
 ; GFX8-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readlane_b32 s6, v1, s3
-; GFX12-NEXT:    s_lshl_b32 s7, 1, s3
 ; GFX12-NEXT:    v_writelane_b32 v0, s0, s3
+; GFX12-NEXT:    s_lshl_b32 s3, 1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_and_not1_b32 s1, s1, s3
 ; GFX12-NEXT:    s_add_f32 s0, s0, s6
 ; GFX12-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX12-NEXT:  ; %bb.6: ; %ComputeEnd
@@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX942-NEXT:  .LBB29_5: ; %ComputeLoop
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX942-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX942-NEXT:    s_mov_b32 m0, s3
+; GFX942-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX942-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX942-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:  .LBB29_5: ; %ComputeLoop
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX11-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 s0, s0, s7
 ; GFX11-NEXT:    v_writelane_b32 v0, s3, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX11-NEXT:    s_and_not1_b32 s0, s0, s1
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX11-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    s_ff1_i32_b32 s1, s0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX10-NEXT:    v_readlane_b32 s6, v2, s1
-; GFX10-NEXT:    s_lshl_b32 s7, 1, s1
-; GFX10-NEXT:    s_andn2_b32 s0, s0, s7
 ; GFX10-NEXT:    v_writelane_b32 v0, s3, s1
 ; GFX10-NEXT:    v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_lshl_b32 s1, 1, s1
+; GFX10-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX10-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX90A-NEXT:  .LBB29_5: ; %ComputeLoop
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX90A-NEXT:    s_mov_b32 m0, s3
+; GFX90A-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX90A-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX90A-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX908-NEXT:  .LBB29_5: ; %ComputeLoop
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX908-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX908-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX908-NEXT:    s_mov_b32 m0, s3
+; GFX908-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX908-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX908-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX908-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX908-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX908-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:  .LBB29_5: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX8-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX8-NEXT:    s_mov_b32 m0, s3
+; GFX8-NEXT:    v_readlane_b32 s8, v2, s3
+; GFX8-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s3
 ; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, s8, v1
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB29_5
 ; GFX8-NEXT:  ; %bb.6: ; %ComputeEnd
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index c1cf06e..fba42c4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,9 +388,8 @@ body:             |
   ; GCN-NEXT:   liveins: $sgpr0, $vgpr0_vgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
-  ; GCN-NEXT:   S_NOP 0, implicit killed $scc
-  ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+  ; GCN-NEXT:   S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   S_NOP 0, implicit $scc
   ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
@@ -417,6 +416,80 @@ body:             |
     S_ENDPGM 0
 
 ...
+---
+name:            xor_1_cmp_lg_0_killed_scc
+body:             |
+  ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GCN-NEXT:   [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
+  ; GCN-NEXT:   S_NOP 0, implicit $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0, $vgpr0_vgpr1
+
+    %0:sreg_32 = COPY $sgpr0
+    %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
+    S_NOP 0, implicit killed $scc
+    S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+name:            absdiff_1_cmp_lg_0_killed_scc
+body:             |
+  ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GCN-NEXT:   [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
+  ; GCN-NEXT:   S_NOP 0, implicit $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0, $vgpr0_vgpr1
+
+    %0:sreg_32 = COPY $sgpr0
+    %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
+    S_NOP 0, implicit killed $scc
+    S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
 
 ---
 name:            and_1_cmp_eq_1_clobbered_scc
@@ -2070,8 +2143,7 @@ body:             |
   ; GCN-NEXT:   liveins: $sgpr0, $vgpr0_vgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
   ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
index ef3e04c..6ce614b 100644
--- a/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/prevent-fmul-hoist-ir.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=fast -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=FP-CONTRACT-FAST %s
-; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off --enable-unsafe-fp-math -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s
+; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=UNSAFE-FP-MATH %s
 ; RUN: opt -S -passes='simplifycfg<hoist-common-insts>' -mtriple=amdgcn-- --fp-contract=off -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX -check-prefix=NO-UNSAFE-FP-MATH %s
 
 define double @is_profitable_f64_contract(ptr dereferenceable(8) %ptr_x, ptr dereferenceable(8) %ptr_y, ptr dereferenceable(8) %ptr_a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index f53aaaa..dd5f838 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
 
 declare i32 @llvm.ctpop.i32(i32)
 declare i64 @llvm.ctpop.i64(i64)
@@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: shl32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_lshl_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: shl64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: lshr32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_lshr_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: lshr64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: ashr32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_ashr_i32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: ashr64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
 ; CHECK-LABEL: abs32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_abs_i32 s0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: and32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: and64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_or_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: or64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: xor32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_xor_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: xor64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: nand32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_nand_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: nand64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_nand_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: nor32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_nor_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: nor64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_nor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: xnor32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_xnor_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: xnor64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: andn232:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_andn2_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: nandn264:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: orn232:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_orn2_b32 s0, s0, s1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
 ; CHECK-LABEL: orn264:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_orn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
 ; CHECK-LABEL: bfe_i32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_bfe_i32 s0, s0, 0x80010
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
 ; CHECK-LABEL: bfe_u32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
 ; CHECK-LABEL: bcnt132:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
 ; CHECK-LABEL: quadmask32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_quadmask_b32 s0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
 ; CHECK-LABEL: quadmask64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
 ; CHECK-LABEL: not32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_not_b32 s0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s0
 ; CHECK-NEXT:    ;;#ASMEND
@@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
 ; CHECK-LABEL: not64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_not_b64 s[0:1], s[0:1]
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
   %zext = zext i1 %cmp to i32
   ret i32 %zext
 }
+
+
+; --------------------------------------------------------------------------------
+; Negative tests
+; --------------------------------------------------------------------------------
+
+@1 = extern_weak dso_local addrspace(4) constant i32
+
+define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
+; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getpc_b64 s[0:1]
+; CHECK-NEXT:    s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
+; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT:    s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT:  ; %bb.1: ; %endif
+; CHECK-NEXT:    s_mov_b32 s0, 1
+; CHECK-NEXT:    s_branch .LBB35_3
+; CHECK-NEXT:  .LBB35_2: ; %if
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_branch .LBB35_3
+; CHECK-NEXT:  .LBB35_3:
+  %cmp = icmp ne ptr addrspace(4) @1, null ; null test on the address of extern_weak @1; the assertions above expect the explicit s_cmp_lg_u64 to remain after the s_add_u32/s_addc_u32 pair
+  br i1 %cmp, label %endif, label %if ; non-null address -> %endif (returns 1), null address -> %if (returns 0)
+
+if:
+  ret i32 0 ; reached when %cmp is false, i.e. the address of @1 is null
+
+endif:
+  ret i32 1 ; reached when %cmp is true, i.e. the address of @1 is non-null
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index a828ee0..7552f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
 ; CHECK-LABEL: s_uaddo_pseudo:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_add_u32 s0, s0, 1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; CHECK-NEXT:    s_addc_u32 s0, 1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
 ; CHECK-LABEL: s_usubo_pseudo:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_sub_u32 s0, s0, 1
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; CHECK-NEXT:    s_subb_u32 s0, s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 5f6d622..71f5a94 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_addc_u32 s15, 0, s16
 ; GCN-NEXT:    s_add_u32 s16, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s14, s14, s15
 ; GCN-NEXT:    s_mul_i32 s0, s12, s14
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
@@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_add_u32 s15, s16, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s14, s14, s12
 ; GCN-NEXT:    s_ashr_i32 s12, s7, 31
 ; GCN-NEXT:    s_add_u32 s0, s6, s12
@@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_addc_u32 s4, s4, 0
 ; GCN-NEXT:    s_mul_i32 s14, s7, s14
-; GCN-NEXT:    s_add_u32 s14, s1, s14
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    s_add_u32 s16, s1, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_addc_u32 s15, 0, s4
+; GCN-NEXT:    s_addc_u32 s17, 0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_mul_i32 s4, s10, s15
+; GCN-NEXT:    s_mul_i32 s4, s10, s17
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
 ; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s5, s11, s14
-; GCN-NEXT:    s_add_i32 s16, s4, s5
-; GCN-NEXT:    s_sub_i32 s17, s7, s16
-; GCN-NEXT:    s_mul_i32 s4, s10, s14
+; GCN-NEXT:    s_mul_i32 s5, s11, s16
+; GCN-NEXT:    s_add_i32 s18, s4, s5
+; GCN-NEXT:    s_sub_i32 s14, s7, s18
+; GCN-NEXT:    s_mul_i32 s4, s10, s16
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s18, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s18, 0
-; GCN-NEXT:    s_subb_u32 s17, s17, s11
-; GCN-NEXT:    s_sub_u32 s19, s6, s10
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_or_b32 s15, s4, s5
+; GCN-NEXT:    s_subb_u32 s19, s14, s11
+; GCN-NEXT:    s_sub_u32 s20, s6, s10
+; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT:    s_or_b32 s14, s14, s15
+; GCN-NEXT:    s_subb_u32 s14, s19, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s11
+; GCN-NEXT:    s_cselect_b32 s15, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s20, s10
+; GCN-NEXT:    s_cselect_b32 s19, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s14, s11
+; GCN-NEXT:    s_cselect_b32 s14, s19, s15
+; GCN-NEXT:    s_add_u32 s15, s16, 1
+; GCN-NEXT:    s_addc_u32 s19, s17, 0
+; GCN-NEXT:    s_add_u32 s20, s16, 2
+; GCN-NEXT:    s_addc_u32 s21, s17, 0
+; GCN-NEXT:    s_cmp_lg_u32 s14, 0
+; GCN-NEXT:    s_cselect_b32 s14, s20, s15
+; GCN-NEXT:    s_cselect_b32 s15, s21, s19
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s4, s17, 0
+; GCN-NEXT:    s_subb_u32 s4, s7, s18
 ; GCN-NEXT:    s_cmp_ge_u32 s4, s11
 ; GCN-NEXT:    s_cselect_b32 s5, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s19, s10
-; GCN-NEXT:    s_cselect_b32 s17, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s4, s11
-; GCN-NEXT:    s_cselect_b32 s4, s17, s5
-; GCN-NEXT:    s_add_u32 s5, s14, 1
-; GCN-NEXT:    s_addc_u32 s17, s15, 0
-; GCN-NEXT:    s_add_u32 s19, s14, 2
-; GCN-NEXT:    s_addc_u32 s20, s15, 0
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s19, s5
-; GCN-NEXT:    s_cselect_b32 s5, s20, s17
-; GCN-NEXT:    s_cmp_lg_u32 s18, 0
-; GCN-NEXT:    s_subb_u32 s7, s7, s16
-; GCN-NEXT:    s_cmp_ge_u32 s7, s11
-; GCN-NEXT:    s_cselect_b32 s16, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s10
 ; GCN-NEXT:    s_cselect_b32 s6, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s7, s11
-; GCN-NEXT:    s_cselect_b32 s6, s6, s16
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s5, s5, s15
-; GCN-NEXT:    s_cselect_b32 s4, s4, s14
+; GCN-NEXT:    s_cmp_eq_u32 s4, s11
+; GCN-NEXT:    s_cselect_b32 s4, s6, s5
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s5, s15, s17
+; GCN-NEXT:    s_cselect_b32 s4, s14, s16
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_sub_u32 s4, s4, s6
@@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
-; GCN-IR-NEXT:    s_cmp_lg_u32 s10, 0
 ; GCN-IR-NEXT:    s_addc_u32 s10, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
@@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
-; GCN-IR-NEXT:    s_cmp_lg_u32 s20, 0
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[8:9]
@@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_addc_u32 s12, 0, s13
 ; GCN-NEXT:    s_add_u32 s13, s8, s9
 ; GCN-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_addc_u32 s11, s11, s12
 ; GCN-NEXT:    s_mul_i32 s8, s2, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
@@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_add_u32 s2, s13, s2
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_addc_u32 s8, s11, s10
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s8, 24
@@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_readfirstlane_b32 s10, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
 ; GCN-NEXT:    s_add_u32 s8, s10, s8
-; GCN-NEXT:    s_addc_u32 s10, 0, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s9
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_mul_i32 s8, s7, s10
+; GCN-NEXT:    s_mul_i32 s8, s7, s12
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s11, s9, s8
-; GCN-NEXT:    s_sub_i32 s12, 0, s11
-; GCN-NEXT:    s_mul_i32 s8, s6, s10
-; GCN-NEXT:    s_sub_u32 s13, 24, s8
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_subb_u32 s12, s12, s7
-; GCN-NEXT:    s_sub_u32 s15, s13, s6
+; GCN-NEXT:    s_add_i32 s13, s9, s8
+; GCN-NEXT:    s_sub_i32 s10, 0, s13
+; GCN-NEXT:    s_mul_i32 s8, s6, s12
+; GCN-NEXT:    s_sub_u32 s14, 24, s8
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s11, s8, s9
+; GCN-NEXT:    s_subb_u32 s15, s10, s7
+; GCN-NEXT:    s_sub_u32 s16, s14, s6
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s10, s10, s11
+; GCN-NEXT:    s_subb_u32 s10, s15, 0
+; GCN-NEXT:    s_cmp_ge_u32 s10, s7
+; GCN-NEXT:    s_cselect_b32 s11, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s16, s6
+; GCN-NEXT:    s_cselect_b32 s15, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s10, s7
+; GCN-NEXT:    s_cselect_b32 s10, s15, s11
+; GCN-NEXT:    s_add_u32 s11, s12, 1
+; GCN-NEXT:    s_addc_u32 s15, 0, 0
+; GCN-NEXT:    s_add_u32 s16, s12, 2
+; GCN-NEXT:    s_addc_u32 s17, 0, 0
+; GCN-NEXT:    s_cmp_lg_u32 s10, 0
+; GCN-NEXT:    s_cselect_b32 s10, s16, s11
+; GCN-NEXT:    s_cselect_b32 s11, s17, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_subb_u32 s8, 0, s13
 ; GCN-NEXT:    s_cmp_ge_u32 s8, s7
 ; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s15, s6
-; GCN-NEXT:    s_cselect_b32 s12, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, s7
-; GCN-NEXT:    s_cselect_b32 s8, s12, s9
-; GCN-NEXT:    s_add_u32 s9, s10, 1
-; GCN-NEXT:    s_addc_u32 s12, 0, 0
-; GCN-NEXT:    s_add_u32 s15, s10, 2
-; GCN-NEXT:    s_addc_u32 s16, 0, 0
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s15, s9
-; GCN-NEXT:    s_cselect_b32 s9, s16, s12
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_subb_u32 s11, 0, s11
-; GCN-NEXT:    s_cmp_ge_u32 s11, s7
-; GCN-NEXT:    s_cselect_b32 s12, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s6
+; GCN-NEXT:    s_cmp_ge_u32 s14, s6
 ; GCN-NEXT:    s_cselect_b32 s6, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s11, s7
-; GCN-NEXT:    s_cselect_b32 s6, s6, s12
+; GCN-NEXT:    s_cmp_eq_u32 s8, s7
+; GCN-NEXT:    s_cselect_b32 s6, s6, s9
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s7, s9, 0
-; GCN-NEXT:    s_cselect_b32 s6, s8, s10
+; GCN-NEXT:    s_cselect_b32 s7, s11, 0
+; GCN-NEXT:    s_cselect_b32 s6, s10, s12
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_subb_u32 s7, s7, s4
@@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s8, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
@@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
-; GCN-IR-NEXT:    s_cmp_lg_u32 s18, 0
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 09596e9..7ddd90e 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
 
 define amdgpu_kernel void @sitofp_i16_to_f16(
 ; SI-LABEL: sitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index bbd1793..e12e31b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GCN-NEXT:    s_sub_u32 s3, 0, s8
-; GCN-NEXT:    s_subb_u32 s12, 0, s9
+; GCN-NEXT:    s_subb_u32 s10, 0, s9
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s13, v1
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s11, s3, s13
-; GCN-NEXT:    s_mul_hi_u32 s15, s3, s10
-; GCN-NEXT:    s_mul_i32 s14, s12, s10
-; GCN-NEXT:    s_add_i32 s11, s15, s11
-; GCN-NEXT:    s_add_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s16, s3, s10
-; GCN-NEXT:    s_mul_i32 s15, s10, s11
-; GCN-NEXT:    s_mul_hi_u32 s17, s10, s16
-; GCN-NEXT:    s_mul_hi_u32 s14, s10, s11
+; GCN-NEXT:    v_readfirstlane_b32 s11, v1
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s13, s3, s11
+; GCN-NEXT:    s_mul_hi_u32 s15, s3, s12
+; GCN-NEXT:    s_mul_i32 s14, s10, s12
+; GCN-NEXT:    s_add_i32 s13, s15, s13
+; GCN-NEXT:    s_add_i32 s13, s13, s14
+; GCN-NEXT:    s_mul_i32 s16, s3, s12
+; GCN-NEXT:    s_mul_i32 s15, s12, s13
+; GCN-NEXT:    s_mul_hi_u32 s17, s12, s16
+; GCN-NEXT:    s_mul_hi_u32 s14, s12, s13
 ; GCN-NEXT:    s_add_u32 s15, s17, s15
 ; GCN-NEXT:    s_addc_u32 s14, 0, s14
-; GCN-NEXT:    s_mul_hi_u32 s18, s13, s16
-; GCN-NEXT:    s_mul_i32 s16, s13, s16
+; GCN-NEXT:    s_mul_hi_u32 s18, s11, s16
+; GCN-NEXT:    s_mul_i32 s16, s11, s16
 ; GCN-NEXT:    s_add_u32 s15, s15, s16
-; GCN-NEXT:    s_mul_hi_u32 s17, s13, s11
+; GCN-NEXT:    s_mul_hi_u32 s17, s11, s13
 ; GCN-NEXT:    s_addc_u32 s14, s14, s18
 ; GCN-NEXT:    s_addc_u32 s15, s17, 0
-; GCN-NEXT:    s_mul_i32 s11, s13, s11
-; GCN-NEXT:    s_add_u32 s11, s14, s11
+; GCN-NEXT:    s_mul_i32 s13, s11, s13
+; GCN-NEXT:    s_add_u32 s13, s14, s13
 ; GCN-NEXT:    s_addc_u32 s14, 0, s15
-; GCN-NEXT:    s_add_u32 s15, s10, s11
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT:    s_addc_u32 s13, s13, s14
-; GCN-NEXT:    s_mul_i32 s10, s3, s13
-; GCN-NEXT:    s_mul_hi_u32 s11, s3, s15
-; GCN-NEXT:    s_add_i32 s10, s11, s10
-; GCN-NEXT:    s_mul_i32 s12, s12, s15
-; GCN-NEXT:    s_add_i32 s10, s10, s12
-; GCN-NEXT:    s_mul_i32 s3, s3, s15
-; GCN-NEXT:    s_mul_hi_u32 s12, s13, s3
-; GCN-NEXT:    s_mul_i32 s14, s13, s3
-; GCN-NEXT:    s_mul_i32 s17, s15, s10
-; GCN-NEXT:    s_mul_hi_u32 s3, s15, s3
-; GCN-NEXT:    s_mul_hi_u32 s16, s15, s10
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s11, s11, s14
+; GCN-NEXT:    s_mul_i32 s13, s3, s11
+; GCN-NEXT:    s_mul_hi_u32 s14, s3, s12
+; GCN-NEXT:    s_add_i32 s13, s14, s13
+; GCN-NEXT:    s_mul_i32 s10, s10, s12
+; GCN-NEXT:    s_add_i32 s13, s13, s10
+; GCN-NEXT:    s_mul_i32 s3, s3, s12
+; GCN-NEXT:    s_mul_hi_u32 s14, s11, s3
+; GCN-NEXT:    s_mul_i32 s15, s11, s3
+; GCN-NEXT:    s_mul_i32 s17, s12, s13
+; GCN-NEXT:    s_mul_hi_u32 s3, s12, s3
+; GCN-NEXT:    s_mul_hi_u32 s16, s12, s13
 ; GCN-NEXT:    s_add_u32 s3, s3, s17
 ; GCN-NEXT:    s_addc_u32 s16, 0, s16
-; GCN-NEXT:    s_add_u32 s3, s3, s14
-; GCN-NEXT:    s_mul_hi_u32 s11, s13, s10
-; GCN-NEXT:    s_addc_u32 s3, s16, s12
-; GCN-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-NEXT:    s_mul_i32 s10, s13, s10
-; GCN-NEXT:    s_add_u32 s3, s3, s10
-; GCN-NEXT:    s_addc_u32 s12, 0, s11
-; GCN-NEXT:    s_add_u32 s3, s15, s3
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT:    s_addc_u32 s14, s13, s12
+; GCN-NEXT:    s_add_u32 s3, s3, s15
+; GCN-NEXT:    s_mul_hi_u32 s10, s11, s13
+; GCN-NEXT:    s_addc_u32 s3, s16, s14
+; GCN-NEXT:    s_addc_u32 s10, s10, 0
+; GCN-NEXT:    s_mul_i32 s13, s11, s13
+; GCN-NEXT:    s_add_u32 s3, s3, s13
+; GCN-NEXT:    s_addc_u32 s10, 0, s10
+; GCN-NEXT:    s_add_u32 s3, s12, s3
+; GCN-NEXT:    s_addc_u32 s14, s11, s10
 ; GCN-NEXT:    s_ashr_i32 s10, s5, 31
 ; GCN-NEXT:    s_add_u32 s12, s4, s10
 ; GCN-NEXT:    s_mov_b32 s11, s10
@@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    s_mul_i32 s3, s8, s3
 ; GCN-NEXT:    s_sub_u32 s3, s12, s3
 ; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GCN-NEXT:    s_subb_u32 s12, s16, s9
 ; GCN-NEXT:    s_sub_u32 s18, s3, s8
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s19, s12, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s19, s9
 ; GCN-NEXT:    s_cselect_b32 s20, -1, 0
@@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    s_cselect_b32 s20, s21, s20
 ; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s12, s12, s9
-; GCN-NEXT:    s_sub_u32 s21, s18, s8
-; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT:    s_sub_u32 s16, s18, s8
 ; GCN-NEXT:    s_subb_u32 s12, s12, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s20, 0
-; GCN-NEXT:    s_cselect_b32 s16, s21, s18
+; GCN-NEXT:    s_cselect_b32 s16, s16, s18
 ; GCN-NEXT:    s_cselect_b32 s12, s12, s19
 ; GCN-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GCN-NEXT:    s_subb_u32 s5, s13, s5
@@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    v_readfirstlane_b32 s14, v0
 ; TONGA-NEXT:    s_sub_u32 s12, s12, s14
 ; TONGA-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s3, s3, s7
 ; TONGA-NEXT:    s_sub_u32 s18, s12, s6
 ; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s19, s3, 0
 ; TONGA-NEXT:    s_cmp_ge_u32 s19, s7
 ; TONGA-NEXT:    s_cselect_b32 s20, -1, 0
@@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    s_cselect_b32 s20, s21, s20
 ; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s3, s3, s7
-; TONGA-NEXT:    s_sub_u32 s21, s18, s6
-; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT:    s_sub_u32 s16, s18, s6
 ; TONGA-NEXT:    s_subb_u32 s3, s3, 0
 ; TONGA-NEXT:    s_cmp_lg_u32 s20, 0
-; TONGA-NEXT:    s_cselect_b32 s16, s21, s18
+; TONGA-NEXT:    s_cselect_b32 s16, s16, s18
 ; TONGA-NEXT:    s_cselect_b32 s3, s3, s19
 ; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s5, s13, s5
@@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GCN-NEXT:    s_sub_u32 s9, 0, s6
-; GCN-NEXT:    s_subb_u32 s16, 0, s7
+; GCN-NEXT:    s_subb_u32 s14, 0, s7
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s17, v1
-; GCN-NEXT:    v_readfirstlane_b32 s14, v0
-; GCN-NEXT:    s_mul_i32 s15, s9, s17
-; GCN-NEXT:    s_mul_hi_u32 s19, s9, s14
-; GCN-NEXT:    s_mul_i32 s18, s16, s14
-; GCN-NEXT:    s_add_i32 s15, s19, s15
-; GCN-NEXT:    s_add_i32 s15, s15, s18
-; GCN-NEXT:    s_mul_i32 s20, s9, s14
-; GCN-NEXT:    s_mul_i32 s19, s14, s15
-; GCN-NEXT:    s_mul_hi_u32 s21, s14, s20
-; GCN-NEXT:    s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT:    v_readfirstlane_b32 s15, v1
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    s_mul_i32 s17, s9, s15
+; GCN-NEXT:    s_mul_hi_u32 s19, s9, s16
+; GCN-NEXT:    s_mul_i32 s18, s14, s16
+; GCN-NEXT:    s_add_i32 s17, s19, s17
+; GCN-NEXT:    s_add_i32 s17, s17, s18
+; GCN-NEXT:    s_mul_i32 s20, s9, s16
+; GCN-NEXT:    s_mul_i32 s19, s16, s17
+; GCN-NEXT:    s_mul_hi_u32 s21, s16, s20
+; GCN-NEXT:    s_mul_hi_u32 s18, s16, s17
 ; GCN-NEXT:    s_add_u32 s19, s21, s19
 ; GCN-NEXT:    s_addc_u32 s18, 0, s18
-; GCN-NEXT:    s_mul_hi_u32 s22, s17, s20
-; GCN-NEXT:    s_mul_i32 s20, s17, s20
+; GCN-NEXT:    s_mul_hi_u32 s22, s15, s20
+; GCN-NEXT:    s_mul_i32 s20, s15, s20
 ; GCN-NEXT:    s_add_u32 s19, s19, s20
-; GCN-NEXT:    s_mul_hi_u32 s21, s17, s15
+; GCN-NEXT:    s_mul_hi_u32 s21, s15, s17
 ; GCN-NEXT:    s_addc_u32 s18, s18, s22
 ; GCN-NEXT:    s_addc_u32 s19, s21, 0
-; GCN-NEXT:    s_mul_i32 s15, s17, s15
-; GCN-NEXT:    s_add_u32 s15, s18, s15
+; GCN-NEXT:    s_mul_i32 s17, s15, s17
+; GCN-NEXT:    s_add_u32 s17, s18, s17
 ; GCN-NEXT:    s_addc_u32 s18, 0, s19
-; GCN-NEXT:    s_add_u32 s19, s14, s15
-; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT:    s_addc_u32 s17, s17, s18
-; GCN-NEXT:    s_mul_i32 s14, s9, s17
-; GCN-NEXT:    s_mul_hi_u32 s15, s9, s19
-; GCN-NEXT:    s_add_i32 s14, s15, s14
-; GCN-NEXT:    s_mul_i32 s16, s16, s19
-; GCN-NEXT:    s_add_i32 s14, s14, s16
-; GCN-NEXT:    s_mul_i32 s9, s9, s19
-; GCN-NEXT:    s_mul_hi_u32 s16, s17, s9
-; GCN-NEXT:    s_mul_i32 s18, s17, s9
-; GCN-NEXT:    s_mul_i32 s21, s19, s14
-; GCN-NEXT:    s_mul_hi_u32 s9, s19, s9
-; GCN-NEXT:    s_mul_hi_u32 s20, s19, s14
+; GCN-NEXT:    s_add_u32 s16, s16, s17
+; GCN-NEXT:    s_addc_u32 s15, s15, s18
+; GCN-NEXT:    s_mul_i32 s17, s9, s15
+; GCN-NEXT:    s_mul_hi_u32 s18, s9, s16
+; GCN-NEXT:    s_add_i32 s17, s18, s17
+; GCN-NEXT:    s_mul_i32 s14, s14, s16
+; GCN-NEXT:    s_add_i32 s17, s17, s14
+; GCN-NEXT:    s_mul_i32 s9, s9, s16
+; GCN-NEXT:    s_mul_hi_u32 s18, s15, s9
+; GCN-NEXT:    s_mul_i32 s19, s15, s9
+; GCN-NEXT:    s_mul_i32 s21, s16, s17
+; GCN-NEXT:    s_mul_hi_u32 s9, s16, s9
+; GCN-NEXT:    s_mul_hi_u32 s20, s16, s17
 ; GCN-NEXT:    s_add_u32 s9, s9, s21
 ; GCN-NEXT:    s_addc_u32 s20, 0, s20
-; GCN-NEXT:    s_add_u32 s9, s9, s18
-; GCN-NEXT:    s_mul_hi_u32 s15, s17, s14
-; GCN-NEXT:    s_addc_u32 s9, s20, s16
-; GCN-NEXT:    s_addc_u32 s15, s15, 0
-; GCN-NEXT:    s_mul_i32 s14, s17, s14
-; GCN-NEXT:    s_add_u32 s9, s9, s14
-; GCN-NEXT:    s_addc_u32 s16, 0, s15
-; GCN-NEXT:    s_add_u32 s9, s19, s9
-; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT:    s_addc_u32 s18, s17, s16
+; GCN-NEXT:    s_add_u32 s9, s9, s19
+; GCN-NEXT:    s_mul_hi_u32 s14, s15, s17
+; GCN-NEXT:    s_addc_u32 s9, s20, s18
+; GCN-NEXT:    s_addc_u32 s14, s14, 0
+; GCN-NEXT:    s_mul_i32 s17, s15, s17
+; GCN-NEXT:    s_add_u32 s9, s9, s17
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    s_add_u32 s9, s16, s9
+; GCN-NEXT:    s_addc_u32 s18, s15, s14
 ; GCN-NEXT:    s_ashr_i32 s14, s11, 31
 ; GCN-NEXT:    s_add_u32 s16, s10, s14
 ; GCN-NEXT:    s_mov_b32 s15, s14
@@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s9, s6, s9
 ; GCN-NEXT:    s_sub_u32 s9, s16, s9
 ; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s16, s20, s7
 ; GCN-NEXT:    s_sub_u32 s22, s9, s6
 ; GCN-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
 ; GCN-NEXT:    s_subb_u32 s23, s16, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s23, s7
 ; GCN-NEXT:    s_cselect_b32 s24, -1, 0
@@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s24, s25, s24
 ; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
 ; GCN-NEXT:    s_subb_u32 s16, s16, s7
-; GCN-NEXT:    s_sub_u32 s25, s22, s6
-; GCN-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT:    s_sub_u32 s20, s22, s6
 ; GCN-NEXT:    s_subb_u32 s16, s16, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s24, 0
-; GCN-NEXT:    s_cselect_b32 s20, s25, s22
+; GCN-NEXT:    s_cselect_b32 s20, s20, s22
 ; GCN-NEXT:    s_cselect_b32 s16, s16, s23
 ; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s11, s17, s11
@@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GCN-NEXT:    s_sub_u32 s3, 0, s10
-; GCN-NEXT:    s_subb_u32 s14, 0, s11
+; GCN-NEXT:    s_subb_u32 s12, 0, s11
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v1
-; GCN-NEXT:    v_readfirstlane_b32 s12, v0
-; GCN-NEXT:    s_mul_i32 s13, s3, s15
-; GCN-NEXT:    s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT:    s_mul_i32 s16, s14, s12
-; GCN-NEXT:    s_add_i32 s13, s17, s13
-; GCN-NEXT:    s_add_i32 s13, s13, s16
-; GCN-NEXT:    s_mul_i32 s18, s3, s12
-; GCN-NEXT:    s_mul_i32 s17, s12, s13
-; GCN-NEXT:    s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT:    s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s15, s3, s13
+; GCN-NEXT:    s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT:    s_mul_i32 s16, s12, s14
+; GCN-NEXT:    s_add_i32 s15, s17, s15
+; GCN-NEXT:    s_add_i32 s15, s15, s16
+; GCN-NEXT:    s_mul_i32 s18, s3, s14
+; GCN-NEXT:    s_mul_i32 s17, s14, s15
+; GCN-NEXT:    s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT:    s_mul_hi_u32 s16, s14, s15
 ; GCN-NEXT:    s_add_u32 s17, s19, s17
 ; GCN-NEXT:    s_addc_u32 s16, 0, s16
-; GCN-NEXT:    s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT:    s_mul_i32 s18, s15, s18
+; GCN-NEXT:    s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT:    s_mul_i32 s18, s13, s18
 ; GCN-NEXT:    s_add_u32 s17, s17, s18
-; GCN-NEXT:    s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT:    s_mul_hi_u32 s19, s13, s15
 ; GCN-NEXT:    s_addc_u32 s16, s16, s20
 ; GCN-NEXT:    s_addc_u32 s17, s19, 0
-; GCN-NEXT:    s_mul_i32 s13, s15, s13
-; GCN-NEXT:    s_add_u32 s13, s16, s13
+; GCN-NEXT:    s_mul_i32 s15, s13, s15
+; GCN-NEXT:    s_add_u32 s15, s16, s15
 ; GCN-NEXT:    s_addc_u32 s16, 0, s17
-; GCN-NEXT:    s_add_u32 s17, s12, s13
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT:    s_addc_u32 s15, s15, s16
-; GCN-NEXT:    s_mul_i32 s12, s3, s15
-; GCN-NEXT:    s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT:    s_add_i32 s12, s13, s12
-; GCN-NEXT:    s_mul_i32 s14, s14, s17
-; GCN-NEXT:    s_add_i32 s12, s12, s14
-; GCN-NEXT:    s_mul_i32 s3, s3, s17
-; GCN-NEXT:    s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT:    s_mul_i32 s16, s15, s3
-; GCN-NEXT:    s_mul_i32 s19, s17, s12
-; GCN-NEXT:    s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT:    s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT:    s_add_u32 s14, s14, s15
+; GCN-NEXT:    s_addc_u32 s13, s13, s16
+; GCN-NEXT:    s_mul_i32 s15, s3, s13
+; GCN-NEXT:    s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT:    s_add_i32 s15, s16, s15
+; GCN-NEXT:    s_mul_i32 s12, s12, s14
+; GCN-NEXT:    s_add_i32 s15, s15, s12
+; GCN-NEXT:    s_mul_i32 s3, s3, s14
+; GCN-NEXT:    s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT:    s_mul_i32 s17, s13, s3
+; GCN-NEXT:    s_mul_i32 s19, s14, s15
+; GCN-NEXT:    s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT:    s_mul_hi_u32 s18, s14, s15
 ; GCN-NEXT:    s_add_u32 s3, s3, s19
 ; GCN-NEXT:    s_addc_u32 s18, 0, s18
-; GCN-NEXT:    s_add_u32 s3, s3, s16
-; GCN-NEXT:    s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT:    s_addc_u32 s3, s18, s14
-; GCN-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NEXT:    s_mul_i32 s12, s15, s12
-; GCN-NEXT:    s_add_u32 s3, s3, s12
-; GCN-NEXT:    s_addc_u32 s14, 0, s13
-; GCN-NEXT:    s_add_u32 s3, s17, s3
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT:    s_addc_u32 s16, s15, s14
+; GCN-NEXT:    s_add_u32 s3, s3, s17
+; GCN-NEXT:    s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT:    s_addc_u32 s3, s18, s16
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s15, s13, s15
+; GCN-NEXT:    s_add_u32 s3, s3, s15
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    s_add_u32 s3, s14, s3
+; GCN-NEXT:    s_addc_u32 s16, s13, s12
 ; GCN-NEXT:    s_ashr_i32 s12, s5, 31
 ; GCN-NEXT:    s_add_u32 s14, s4, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
@@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s3, s10, s3
 ; GCN-NEXT:    s_sub_u32 s3, s14, s3
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s14, s18, s11
 ; GCN-NEXT:    s_sub_u32 s20, s3, s10
 ; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s21, s14, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s21, s11
 ; GCN-NEXT:    s_cselect_b32 s22, -1, 0
@@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s22, s23, s22
 ; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s14, s14, s11
-; GCN-NEXT:    s_sub_u32 s23, s20, s10
-; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT:    s_sub_u32 s18, s20, s10
 ; GCN-NEXT:    s_subb_u32 s14, s14, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s22, 0
-; GCN-NEXT:    s_cselect_b32 s18, s23, s20
+; GCN-NEXT:    s_cselect_b32 s18, s18, s20
 ; GCN-NEXT:    s_cselect_b32 s14, s14, s21
 ; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s5, s15, s5
@@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_readfirstlane_b32 s14, v0
 ; TONGA-NEXT:    s_sub_u32 s12, s12, s14
 ; TONGA-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s1, s1, s7
 ; TONGA-NEXT:    s_sub_u32 s18, s12, s6
 ; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s19, s1, 0
 ; TONGA-NEXT:    s_cmp_ge_u32 s19, s7
 ; TONGA-NEXT:    s_cselect_b32 s20, -1, 0
@@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_cselect_b32 s20, s21, s20
 ; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s21, s18, s6
-; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT:    s_sub_u32 s16, s18, s6
 ; TONGA-NEXT:    s_subb_u32 s1, s1, 0
 ; TONGA-NEXT:    s_cmp_lg_u32 s20, 0
-; TONGA-NEXT:    s_cselect_b32 s16, s21, s18
+; TONGA-NEXT:    s_cselect_b32 s16, s16, s18
 ; TONGA-NEXT:    s_cselect_b32 s1, s1, s19
 ; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s3, s13, s3
@@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GCN-NEXT:    s_sub_u32 s17, 0, s6
-; GCN-NEXT:    s_subb_u32 s24, 0, s7
+; GCN-NEXT:    s_subb_u32 s22, 0, s7
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s25, v1
-; GCN-NEXT:    v_readfirstlane_b32 s22, v0
-; GCN-NEXT:    s_mul_i32 s23, s17, s25
-; GCN-NEXT:    s_mul_hi_u32 s27, s17, s22
-; GCN-NEXT:    s_mul_i32 s26, s24, s22
-; GCN-NEXT:    s_add_i32 s23, s27, s23
-; GCN-NEXT:    s_add_i32 s23, s23, s26
-; GCN-NEXT:    s_mul_i32 s28, s17, s22
-; GCN-NEXT:    s_mul_i32 s27, s22, s23
-; GCN-NEXT:    s_mul_hi_u32 s29, s22, s28
-; GCN-NEXT:    s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT:    v_readfirstlane_b32 s23, v1
+; GCN-NEXT:    v_readfirstlane_b32 s24, v0
+; GCN-NEXT:    s_mul_i32 s25, s17, s23
+; GCN-NEXT:    s_mul_hi_u32 s27, s17, s24
+; GCN-NEXT:    s_mul_i32 s26, s22, s24
+; GCN-NEXT:    s_add_i32 s25, s27, s25
+; GCN-NEXT:    s_add_i32 s25, s25, s26
+; GCN-NEXT:    s_mul_i32 s28, s17, s24
+; GCN-NEXT:    s_mul_i32 s27, s24, s25
+; GCN-NEXT:    s_mul_hi_u32 s29, s24, s28
+; GCN-NEXT:    s_mul_hi_u32 s26, s24, s25
 ; GCN-NEXT:    s_add_u32 s27, s29, s27
 ; GCN-NEXT:    s_addc_u32 s26, 0, s26
-; GCN-NEXT:    s_mul_hi_u32 s30, s25, s28
-; GCN-NEXT:    s_mul_i32 s28, s25, s28
+; GCN-NEXT:    s_mul_hi_u32 s30, s23, s28
+; GCN-NEXT:    s_mul_i32 s28, s23, s28
 ; GCN-NEXT:    s_add_u32 s27, s27, s28
-; GCN-NEXT:    s_mul_hi_u32 s29, s25, s23
+; GCN-NEXT:    s_mul_hi_u32 s29, s23, s25
 ; GCN-NEXT:    s_addc_u32 s26, s26, s30
 ; GCN-NEXT:    s_addc_u32 s27, s29, 0
-; GCN-NEXT:    s_mul_i32 s23, s25, s23
-; GCN-NEXT:    s_add_u32 s23, s26, s23
+; GCN-NEXT:    s_mul_i32 s25, s23, s25
+; GCN-NEXT:    s_add_u32 s25, s26, s25
 ; GCN-NEXT:    s_addc_u32 s26, 0, s27
-; GCN-NEXT:    s_add_u32 s27, s22, s23
-; GCN-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT:    s_addc_u32 s25, s25, s26
-; GCN-NEXT:    s_mul_i32 s22, s17, s25
-; GCN-NEXT:    s_mul_hi_u32 s23, s17, s27
-; GCN-NEXT:    s_add_i32 s22, s23, s22
-; GCN-NEXT:    s_mul_i32 s24, s24, s27
-; GCN-NEXT:    s_add_i32 s22, s22, s24
-; GCN-NEXT:    s_mul_i32 s17, s17, s27
-; GCN-NEXT:    s_mul_hi_u32 s24, s25, s17
-; GCN-NEXT:    s_mul_i32 s26, s25, s17
-; GCN-NEXT:    s_mul_i32 s29, s27, s22
-; GCN-NEXT:    s_mul_hi_u32 s17, s27, s17
-; GCN-NEXT:    s_mul_hi_u32 s28, s27, s22
+; GCN-NEXT:    s_add_u32 s24, s24, s25
+; GCN-NEXT:    s_addc_u32 s23, s23, s26
+; GCN-NEXT:    s_mul_i32 s25, s17, s23
+; GCN-NEXT:    s_mul_hi_u32 s26, s17, s24
+; GCN-NEXT:    s_add_i32 s25, s26, s25
+; GCN-NEXT:    s_mul_i32 s22, s22, s24
+; GCN-NEXT:    s_add_i32 s25, s25, s22
+; GCN-NEXT:    s_mul_i32 s17, s17, s24
+; GCN-NEXT:    s_mul_hi_u32 s26, s23, s17
+; GCN-NEXT:    s_mul_i32 s27, s23, s17
+; GCN-NEXT:    s_mul_i32 s29, s24, s25
+; GCN-NEXT:    s_mul_hi_u32 s17, s24, s17
+; GCN-NEXT:    s_mul_hi_u32 s28, s24, s25
 ; GCN-NEXT:    s_add_u32 s17, s17, s29
 ; GCN-NEXT:    s_addc_u32 s28, 0, s28
-; GCN-NEXT:    s_add_u32 s17, s17, s26
-; GCN-NEXT:    s_mul_hi_u32 s23, s25, s22
-; GCN-NEXT:    s_addc_u32 s17, s28, s24
-; GCN-NEXT:    s_addc_u32 s23, s23, 0
-; GCN-NEXT:    s_mul_i32 s22, s25, s22
-; GCN-NEXT:    s_add_u32 s17, s17, s22
-; GCN-NEXT:    s_addc_u32 s24, 0, s23
-; GCN-NEXT:    s_add_u32 s17, s27, s17
-; GCN-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT:    s_addc_u32 s26, s25, s24
+; GCN-NEXT:    s_add_u32 s17, s17, s27
+; GCN-NEXT:    s_mul_hi_u32 s22, s23, s25
+; GCN-NEXT:    s_addc_u32 s17, s28, s26
+; GCN-NEXT:    s_addc_u32 s22, s22, 0
+; GCN-NEXT:    s_mul_i32 s25, s23, s25
+; GCN-NEXT:    s_add_u32 s17, s17, s25
+; GCN-NEXT:    s_addc_u32 s22, 0, s22
+; GCN-NEXT:    s_add_u32 s17, s24, s17
+; GCN-NEXT:    s_addc_u32 s26, s23, s22
 ; GCN-NEXT:    s_ashr_i32 s22, s19, 31
 ; GCN-NEXT:    s_add_u32 s24, s18, s22
 ; GCN-NEXT:    s_mov_b32 s23, s22
@@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s17, s6, s17
 ; GCN-NEXT:    s_sub_u32 s17, s24, s17
 ; GCN-NEXT:    s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[26:27], 0
 ; GCN-NEXT:    s_subb_u32 s24, s28, s7
 ; GCN-NEXT:    s_sub_u32 s30, s17, s6
 ; GCN-NEXT:    s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[28:29], 0
 ; GCN-NEXT:    s_subb_u32 s31, s24, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s31, s7
 ; GCN-NEXT:    s_cselect_b32 s33, -1, 0
@@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s33, s34, s33
 ; GCN-NEXT:    s_cmp_lg_u64 s[28:29], 0
 ; GCN-NEXT:    s_subb_u32 s24, s24, s7
-; GCN-NEXT:    s_sub_u32 s34, s30, s6
-; GCN-NEXT:    s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[28:29], 0
+; GCN-NEXT:    s_sub_u32 s28, s30, s6
 ; GCN-NEXT:    s_subb_u32 s24, s24, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s33, 0
-; GCN-NEXT:    s_cselect_b32 s28, s34, s30
+; GCN-NEXT:    s_cselect_b32 s28, s28, s30
 ; GCN-NEXT:    s_cselect_b32 s24, s24, s31
 ; GCN-NEXT:    s_cmp_lg_u64 s[26:27], 0
 ; GCN-NEXT:    s_subb_u32 s19, s25, s19
@@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s18
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s19
 ; GCN-NEXT:    s_sub_u32 s13, 0, s18
-; GCN-NEXT:    s_subb_u32 s22, 0, s19
+; GCN-NEXT:    s_subb_u32 s20, 0, s19
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s23, v1
-; GCN-NEXT:    v_readfirstlane_b32 s20, v0
-; GCN-NEXT:    s_mul_i32 s21, s13, s23
-; GCN-NEXT:    s_mul_hi_u32 s25, s13, s20
-; GCN-NEXT:    s_mul_i32 s24, s22, s20
-; GCN-NEXT:    s_add_i32 s21, s25, s21
-; GCN-NEXT:    s_add_i32 s21, s21, s24
-; GCN-NEXT:    s_mul_i32 s26, s13, s20
-; GCN-NEXT:    s_mul_i32 s25, s20, s21
-; GCN-NEXT:    s_mul_hi_u32 s27, s20, s26
-; GCN-NEXT:    s_mul_hi_u32 s24, s20, s21
+; GCN-NEXT:    v_readfirstlane_b32 s21, v1
+; GCN-NEXT:    v_readfirstlane_b32 s22, v0
+; GCN-NEXT:    s_mul_i32 s23, s13, s21
+; GCN-NEXT:    s_mul_hi_u32 s25, s13, s22
+; GCN-NEXT:    s_mul_i32 s24, s20, s22
+; GCN-NEXT:    s_add_i32 s23, s25, s23
+; GCN-NEXT:    s_add_i32 s23, s23, s24
+; GCN-NEXT:    s_mul_i32 s26, s13, s22
+; GCN-NEXT:    s_mul_i32 s25, s22, s23
+; GCN-NEXT:    s_mul_hi_u32 s27, s22, s26
+; GCN-NEXT:    s_mul_hi_u32 s24, s22, s23
 ; GCN-NEXT:    s_add_u32 s25, s27, s25
 ; GCN-NEXT:    s_addc_u32 s24, 0, s24
-; GCN-NEXT:    s_mul_hi_u32 s28, s23, s26
-; GCN-NEXT:    s_mul_i32 s26, s23, s26
+; GCN-NEXT:    s_mul_hi_u32 s28, s21, s26
+; GCN-NEXT:    s_mul_i32 s26, s21, s26
 ; GCN-NEXT:    s_add_u32 s25, s25, s26
-; GCN-NEXT:    s_mul_hi_u32 s27, s23, s21
+; GCN-NEXT:    s_mul_hi_u32 s27, s21, s23
 ; GCN-NEXT:    s_addc_u32 s24, s24, s28
 ; GCN-NEXT:    s_addc_u32 s25, s27, 0
-; GCN-NEXT:    s_mul_i32 s21, s23, s21
-; GCN-NEXT:    s_add_u32 s21, s24, s21
+; GCN-NEXT:    s_mul_i32 s23, s21, s23
+; GCN-NEXT:    s_add_u32 s23, s24, s23
 ; GCN-NEXT:    s_addc_u32 s24, 0, s25
-; GCN-NEXT:    s_add_u32 s25, s20, s21
-; GCN-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT:    s_addc_u32 s23, s23, s24
-; GCN-NEXT:    s_mul_i32 s20, s13, s23
-; GCN-NEXT:    s_mul_hi_u32 s21, s13, s25
-; GCN-NEXT:    s_add_i32 s20, s21, s20
-; GCN-NEXT:    s_mul_i32 s22, s22, s25
-; GCN-NEXT:    s_add_i32 s20, s20, s22
-; GCN-NEXT:    s_mul_i32 s13, s13, s25
-; GCN-NEXT:    s_mul_hi_u32 s22, s23, s13
-; GCN-NEXT:    s_mul_i32 s24, s23, s13
-; GCN-NEXT:    s_mul_i32 s27, s25, s20
-; GCN-NEXT:    s_mul_hi_u32 s13, s25, s13
-; GCN-NEXT:    s_mul_hi_u32 s26, s25, s20
+; GCN-NEXT:    s_add_u32 s22, s22, s23
+; GCN-NEXT:    s_addc_u32 s21, s21, s24
+; GCN-NEXT:    s_mul_i32 s23, s13, s21
+; GCN-NEXT:    s_mul_hi_u32 s24, s13, s22
+; GCN-NEXT:    s_add_i32 s23, s24, s23
+; GCN-NEXT:    s_mul_i32 s20, s20, s22
+; GCN-NEXT:    s_add_i32 s23, s23, s20
+; GCN-NEXT:    s_mul_i32 s13, s13, s22
+; GCN-NEXT:    s_mul_hi_u32 s24, s21, s13
+; GCN-NEXT:    s_mul_i32 s25, s21, s13
+; GCN-NEXT:    s_mul_i32 s27, s22, s23
+; GCN-NEXT:    s_mul_hi_u32 s13, s22, s13
+; GCN-NEXT:    s_mul_hi_u32 s26, s22, s23
 ; GCN-NEXT:    s_add_u32 s13, s13, s27
 ; GCN-NEXT:    s_addc_u32 s26, 0, s26
-; GCN-NEXT:    s_add_u32 s13, s13, s24
-; GCN-NEXT:    s_mul_hi_u32 s21, s23, s20
-; GCN-NEXT:    s_addc_u32 s13, s26, s22
-; GCN-NEXT:    s_addc_u32 s21, s21, 0
-; GCN-NEXT:    s_mul_i32 s20, s23, s20
-; GCN-NEXT:    s_add_u32 s13, s13, s20
-; GCN-NEXT:    s_addc_u32 s22, 0, s21
-; GCN-NEXT:    s_add_u32 s13, s25, s13
-; GCN-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT:    s_addc_u32 s24, s23, s22
+; GCN-NEXT:    s_add_u32 s13, s13, s25
+; GCN-NEXT:    s_mul_hi_u32 s20, s21, s23
+; GCN-NEXT:    s_addc_u32 s13, s26, s24
+; GCN-NEXT:    s_addc_u32 s20, s20, 0
+; GCN-NEXT:    s_mul_i32 s23, s21, s23
+; GCN-NEXT:    s_add_u32 s13, s13, s23
+; GCN-NEXT:    s_addc_u32 s20, 0, s20
+; GCN-NEXT:    s_add_u32 s13, s22, s13
+; GCN-NEXT:    s_addc_u32 s24, s21, s20
 ; GCN-NEXT:    s_ashr_i32 s20, s15, 31
 ; GCN-NEXT:    s_add_u32 s22, s14, s20
 ; GCN-NEXT:    s_mov_b32 s21, s20
@@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s13, s18, s13
 ; GCN-NEXT:    s_sub_u32 s13, s22, s13
 ; GCN-NEXT:    s_cselect_b64 s[24:25], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[24:25], 0
 ; GCN-NEXT:    s_subb_u32 s22, s26, s19
 ; GCN-NEXT:    s_sub_u32 s28, s13, s18
 ; GCN-NEXT:    s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[26:27], 0
 ; GCN-NEXT:    s_subb_u32 s29, s22, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s29, s19
 ; GCN-NEXT:    s_cselect_b32 s30, -1, 0
@@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s30, s31, s30
 ; GCN-NEXT:    s_cmp_lg_u64 s[26:27], 0
 ; GCN-NEXT:    s_subb_u32 s22, s22, s19
-; GCN-NEXT:    s_sub_u32 s31, s28, s18
-; GCN-NEXT:    s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[26:27], 0
+; GCN-NEXT:    s_sub_u32 s26, s28, s18
 ; GCN-NEXT:    s_subb_u32 s22, s22, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s30, 0
-; GCN-NEXT:    s_cselect_b32 s26, s31, s28
+; GCN-NEXT:    s_cselect_b32 s26, s26, s28
 ; GCN-NEXT:    s_cselect_b32 s22, s22, s29
 ; GCN-NEXT:    s_cmp_lg_u64 s[24:25], 0
 ; GCN-NEXT:    s_subb_u32 s15, s23, s15
@@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
 ; GCN-NEXT:    s_sub_u32 s9, 0, s14
-; GCN-NEXT:    s_subb_u32 s18, 0, s15
+; GCN-NEXT:    s_subb_u32 s16, 0, s15
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s19, v1
-; GCN-NEXT:    v_readfirstlane_b32 s16, v0
-; GCN-NEXT:    s_mul_i32 s17, s9, s19
-; GCN-NEXT:    s_mul_hi_u32 s21, s9, s16
-; GCN-NEXT:    s_mul_i32 s20, s18, s16
-; GCN-NEXT:    s_add_i32 s17, s21, s17
-; GCN-NEXT:    s_add_i32 s17, s17, s20
-; GCN-NEXT:    s_mul_i32 s22, s9, s16
-; GCN-NEXT:    s_mul_i32 s21, s16, s17
-; GCN-NEXT:    s_mul_hi_u32 s23, s16, s22
-; GCN-NEXT:    s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_readfirstlane_b32 s18, v0
+; GCN-NEXT:    s_mul_i32 s19, s9, s17
+; GCN-NEXT:    s_mul_hi_u32 s21, s9, s18
+; GCN-NEXT:    s_mul_i32 s20, s16, s18
+; GCN-NEXT:    s_add_i32 s19, s21, s19
+; GCN-NEXT:    s_add_i32 s19, s19, s20
+; GCN-NEXT:    s_mul_i32 s22, s9, s18
+; GCN-NEXT:    s_mul_i32 s21, s18, s19
+; GCN-NEXT:    s_mul_hi_u32 s23, s18, s22
+; GCN-NEXT:    s_mul_hi_u32 s20, s18, s19
 ; GCN-NEXT:    s_add_u32 s21, s23, s21
 ; GCN-NEXT:    s_addc_u32 s20, 0, s20
-; GCN-NEXT:    s_mul_hi_u32 s24, s19, s22
-; GCN-NEXT:    s_mul_i32 s22, s19, s22
+; GCN-NEXT:    s_mul_hi_u32 s24, s17, s22
+; GCN-NEXT:    s_mul_i32 s22, s17, s22
 ; GCN-NEXT:    s_add_u32 s21, s21, s22
-; GCN-NEXT:    s_mul_hi_u32 s23, s19, s17
+; GCN-NEXT:    s_mul_hi_u32 s23, s17, s19
 ; GCN-NEXT:    s_addc_u32 s20, s20, s24
 ; GCN-NEXT:    s_addc_u32 s21, s23, 0
-; GCN-NEXT:    s_mul_i32 s17, s19, s17
-; GCN-NEXT:    s_add_u32 s17, s20, s17
+; GCN-NEXT:    s_mul_i32 s19, s17, s19
+; GCN-NEXT:    s_add_u32 s19, s20, s19
 ; GCN-NEXT:    s_addc_u32 s20, 0, s21
-; GCN-NEXT:    s_add_u32 s21, s16, s17
-; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT:    s_addc_u32 s19, s19, s20
-; GCN-NEXT:    s_mul_i32 s16, s9, s19
-; GCN-NEXT:    s_mul_hi_u32 s17, s9, s21
-; GCN-NEXT:    s_add_i32 s16, s17, s16
-; GCN-NEXT:    s_mul_i32 s18, s18, s21
-; GCN-NEXT:    s_add_i32 s16, s16, s18
-; GCN-NEXT:    s_mul_i32 s9, s9, s21
-; GCN-NEXT:    s_mul_hi_u32 s18, s19, s9
-; GCN-NEXT:    s_mul_i32 s20, s19, s9
-; GCN-NEXT:    s_mul_i32 s23, s21, s16
-; GCN-NEXT:    s_mul_hi_u32 s9, s21, s9
-; GCN-NEXT:    s_mul_hi_u32 s22, s21, s16
+; GCN-NEXT:    s_add_u32 s18, s18, s19
+; GCN-NEXT:    s_addc_u32 s17, s17, s20
+; GCN-NEXT:    s_mul_i32 s19, s9, s17
+; GCN-NEXT:    s_mul_hi_u32 s20, s9, s18
+; GCN-NEXT:    s_add_i32 s19, s20, s19
+; GCN-NEXT:    s_mul_i32 s16, s16, s18
+; GCN-NEXT:    s_add_i32 s19, s19, s16
+; GCN-NEXT:    s_mul_i32 s9, s9, s18
+; GCN-NEXT:    s_mul_hi_u32 s20, s17, s9
+; GCN-NEXT:    s_mul_i32 s21, s17, s9
+; GCN-NEXT:    s_mul_i32 s23, s18, s19
+; GCN-NEXT:    s_mul_hi_u32 s9, s18, s9
+; GCN-NEXT:    s_mul_hi_u32 s22, s18, s19
 ; GCN-NEXT:    s_add_u32 s9, s9, s23
 ; GCN-NEXT:    s_addc_u32 s22, 0, s22
-; GCN-NEXT:    s_add_u32 s9, s9, s20
-; GCN-NEXT:    s_mul_hi_u32 s17, s19, s16
-; GCN-NEXT:    s_addc_u32 s9, s22, s18
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
-; GCN-NEXT:    s_mul_i32 s16, s19, s16
-; GCN-NEXT:    s_add_u32 s9, s9, s16
-; GCN-NEXT:    s_addc_u32 s18, 0, s17
-; GCN-NEXT:    s_add_u32 s9, s21, s9
-; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT:    s_addc_u32 s20, s19, s18
+; GCN-NEXT:    s_add_u32 s9, s9, s21
+; GCN-NEXT:    s_mul_hi_u32 s16, s17, s19
+; GCN-NEXT:    s_addc_u32 s9, s22, s20
+; GCN-NEXT:    s_addc_u32 s16, s16, 0
+; GCN-NEXT:    s_mul_i32 s19, s17, s19
+; GCN-NEXT:    s_add_u32 s9, s9, s19
+; GCN-NEXT:    s_addc_u32 s16, 0, s16
+; GCN-NEXT:    s_add_u32 s9, s18, s9
+; GCN-NEXT:    s_addc_u32 s20, s17, s16
 ; GCN-NEXT:    s_ashr_i32 s16, s11, 31
 ; GCN-NEXT:    s_add_u32 s18, s10, s16
 ; GCN-NEXT:    s_mov_b32 s17, s16
@@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s9, s14, s9
 ; GCN-NEXT:    s_sub_u32 s9, s18, s9
 ; GCN-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
 ; GCN-NEXT:    s_subb_u32 s18, s22, s15
 ; GCN-NEXT:    s_sub_u32 s24, s9, s14
 ; GCN-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[22:23], 0
 ; GCN-NEXT:    s_subb_u32 s25, s18, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s25, s15
 ; GCN-NEXT:    s_cselect_b32 s26, -1, 0
@@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s26, s27, s26
 ; GCN-NEXT:    s_cmp_lg_u64 s[22:23], 0
 ; GCN-NEXT:    s_subb_u32 s18, s18, s15
-; GCN-NEXT:    s_sub_u32 s27, s24, s14
-; GCN-NEXT:    s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT:    s_sub_u32 s22, s24, s14
 ; GCN-NEXT:    s_subb_u32 s18, s18, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s26, 0
-; GCN-NEXT:    s_cselect_b32 s22, s27, s24
+; GCN-NEXT:    s_cselect_b32 s22, s22, s24
 ; GCN-NEXT:    s_cselect_b32 s18, s18, s25
 ; GCN-NEXT:    s_cmp_lg_u64 s[20:21], 0
 ; GCN-NEXT:    s_subb_u32 s11, s19, s11
@@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GCN-NEXT:    s_sub_u32 s3, 0, s10
-; GCN-NEXT:    s_subb_u32 s14, 0, s11
+; GCN-NEXT:    s_subb_u32 s12, 0, s11
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v1
-; GCN-NEXT:    v_readfirstlane_b32 s12, v0
-; GCN-NEXT:    s_mul_i32 s13, s3, s15
-; GCN-NEXT:    s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT:    s_mul_i32 s16, s14, s12
-; GCN-NEXT:    s_add_i32 s13, s17, s13
-; GCN-NEXT:    s_add_i32 s13, s13, s16
-; GCN-NEXT:    s_mul_i32 s18, s3, s12
-; GCN-NEXT:    s_mul_i32 s17, s12, s13
-; GCN-NEXT:    s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT:    s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s15, s3, s13
+; GCN-NEXT:    s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT:    s_mul_i32 s16, s12, s14
+; GCN-NEXT:    s_add_i32 s15, s17, s15
+; GCN-NEXT:    s_add_i32 s15, s15, s16
+; GCN-NEXT:    s_mul_i32 s18, s3, s14
+; GCN-NEXT:    s_mul_i32 s17, s14, s15
+; GCN-NEXT:    s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT:    s_mul_hi_u32 s16, s14, s15
 ; GCN-NEXT:    s_add_u32 s17, s19, s17
 ; GCN-NEXT:    s_addc_u32 s16, 0, s16
-; GCN-NEXT:    s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT:    s_mul_i32 s18, s15, s18
+; GCN-NEXT:    s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT:    s_mul_i32 s18, s13, s18
 ; GCN-NEXT:    s_add_u32 s17, s17, s18
-; GCN-NEXT:    s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT:    s_mul_hi_u32 s19, s13, s15
 ; GCN-NEXT:    s_addc_u32 s16, s16, s20
 ; GCN-NEXT:    s_addc_u32 s17, s19, 0
-; GCN-NEXT:    s_mul_i32 s13, s15, s13
-; GCN-NEXT:    s_add_u32 s13, s16, s13
+; GCN-NEXT:    s_mul_i32 s15, s13, s15
+; GCN-NEXT:    s_add_u32 s15, s16, s15
 ; GCN-NEXT:    s_addc_u32 s16, 0, s17
-; GCN-NEXT:    s_add_u32 s17, s12, s13
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT:    s_addc_u32 s15, s15, s16
-; GCN-NEXT:    s_mul_i32 s12, s3, s15
-; GCN-NEXT:    s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT:    s_add_i32 s12, s13, s12
-; GCN-NEXT:    s_mul_i32 s14, s14, s17
-; GCN-NEXT:    s_add_i32 s12, s12, s14
-; GCN-NEXT:    s_mul_i32 s3, s3, s17
-; GCN-NEXT:    s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT:    s_mul_i32 s16, s15, s3
-; GCN-NEXT:    s_mul_i32 s19, s17, s12
-; GCN-NEXT:    s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT:    s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT:    s_add_u32 s14, s14, s15
+; GCN-NEXT:    s_addc_u32 s13, s13, s16
+; GCN-NEXT:    s_mul_i32 s15, s3, s13
+; GCN-NEXT:    s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT:    s_add_i32 s15, s16, s15
+; GCN-NEXT:    s_mul_i32 s12, s12, s14
+; GCN-NEXT:    s_add_i32 s15, s15, s12
+; GCN-NEXT:    s_mul_i32 s3, s3, s14
+; GCN-NEXT:    s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT:    s_mul_i32 s17, s13, s3
+; GCN-NEXT:    s_mul_i32 s19, s14, s15
+; GCN-NEXT:    s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT:    s_mul_hi_u32 s18, s14, s15
 ; GCN-NEXT:    s_add_u32 s3, s3, s19
 ; GCN-NEXT:    s_addc_u32 s18, 0, s18
-; GCN-NEXT:    s_add_u32 s3, s3, s16
-; GCN-NEXT:    s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT:    s_addc_u32 s3, s18, s14
-; GCN-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NEXT:    s_mul_i32 s12, s15, s12
-; GCN-NEXT:    s_add_u32 s3, s3, s12
-; GCN-NEXT:    s_addc_u32 s14, 0, s13
-; GCN-NEXT:    s_add_u32 s3, s17, s3
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT:    s_addc_u32 s16, s15, s14
+; GCN-NEXT:    s_add_u32 s3, s3, s17
+; GCN-NEXT:    s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT:    s_addc_u32 s3, s18, s16
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s15, s13, s15
+; GCN-NEXT:    s_add_u32 s3, s3, s15
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    s_add_u32 s3, s14, s3
+; GCN-NEXT:    s_addc_u32 s16, s13, s12
 ; GCN-NEXT:    s_ashr_i32 s12, s5, 31
 ; GCN-NEXT:    s_add_u32 s14, s4, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
@@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mul_i32 s3, s10, s3
 ; GCN-NEXT:    s_sub_u32 s3, s14, s3
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s14, s18, s11
 ; GCN-NEXT:    s_sub_u32 s20, s3, s10
 ; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s21, s14, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s21, s11
 ; GCN-NEXT:    s_cselect_b32 s22, -1, 0
@@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s22, s23, s22
 ; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
 ; GCN-NEXT:    s_subb_u32 s14, s14, s11
-; GCN-NEXT:    s_sub_u32 s23, s20, s10
-; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT:    s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT:    s_sub_u32 s18, s20, s10
 ; GCN-NEXT:    s_subb_u32 s14, s14, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s22, 0
-; GCN-NEXT:    s_cselect_b32 s18, s23, s20
+; GCN-NEXT:    s_cselect_b32 s18, s18, s20
 ; GCN-NEXT:    s_cselect_b32 s14, s14, s21
 ; GCN-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; GCN-NEXT:    s_subb_u32 s5, s15, s5
@@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_readfirstlane_b32 s14, v8
 ; TONGA-NEXT:    s_sub_u32 s12, s12, s14
 ; TONGA-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s1, s1, s7
 ; TONGA-NEXT:    s_sub_u32 s18, s12, s6
 ; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s19, s1, 0
 ; TONGA-NEXT:    s_cmp_ge_u32 s19, s7
 ; TONGA-NEXT:    s_cselect_b32 s20, -1, 0
@@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_cselect_b32 s20, s21, s20
 ; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
 ; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s21, s18, s6
-; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT:    s_sub_u32 s16, s18, s6
 ; TONGA-NEXT:    s_subb_u32 s1, s1, 0
 ; TONGA-NEXT:    s_cmp_lg_u32 s20, 0
-; TONGA-NEXT:    s_cselect_b32 s16, s21, s18
+; TONGA-NEXT:    s_cselect_b32 s16, s16, s18
 ; TONGA-NEXT:    s_cselect_b32 s1, s1, s19
 ; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; TONGA-NEXT:    s_subb_u32 s3, s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33b0a5d..ea9bb04 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_addc_u32 s13, 0, s14
 ; GCN-NEXT:    s_add_u32 s14, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s12, s12, s13
 ; GCN-NEXT:    s_mul_i32 s0, s10, s12
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_add_u32 s11, s14, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s1, s12, s10
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_readfirstlane_b32 s10, v0
 ; GCN-NEXT:    s_add_i32 s5, s10, s5
 ; GCN-NEXT:    s_mul_i32 s10, s9, s4
-; GCN-NEXT:    s_add_i32 s10, s5, s10
-; GCN-NEXT:    s_sub_i32 s11, s7, s10
+; GCN-NEXT:    s_add_i32 s12, s5, s10
+; GCN-NEXT:    s_sub_i32 s10, s7, s12
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s12, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s11, s11, s9
-; GCN-NEXT:    s_sub_u32 s13, s6, s8
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_or_b32 s11, s4, s5
+; GCN-NEXT:    s_subb_u32 s13, s10, s9
+; GCN-NEXT:    s_sub_u32 s14, s6, s8
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s15, s10, s11
+; GCN-NEXT:    s_subb_u32 s15, s13, 0
+; GCN-NEXT:    s_cmp_ge_u32 s15, s9
+; GCN-NEXT:    s_cselect_b32 s16, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s8
+; GCN-NEXT:    s_cselect_b32 s17, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s15, s9
+; GCN-NEXT:    s_cselect_b32 s16, s17, s16
+; GCN-NEXT:    s_or_b32 s10, s10, s11
+; GCN-NEXT:    s_subb_u32 s13, s13, s9
+; GCN-NEXT:    s_sub_u32 s17, s14, s8
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s10, s10, s11
+; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_cmp_lg_u32 s16, 0
+; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s14, s11, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s9
+; GCN-NEXT:    s_subb_u32 s4, s7, s12
+; GCN-NEXT:    s_cmp_ge_u32 s4, s9
 ; GCN-NEXT:    s_cselect_b32 s5, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s8
-; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s14, s9
-; GCN-NEXT:    s_cselect_b32 s15, s15, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s11, s11, s9
-; GCN-NEXT:    s_sub_u32 s16, s13, s8
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s4, s11, 0
-; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s5, s16, s13
-; GCN-NEXT:    s_cselect_b32 s4, s4, s14
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s7, s7, s10
-; GCN-NEXT:    s_cmp_ge_u32 s7, s9
-; GCN-NEXT:    s_cselect_b32 s10, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s8
-; GCN-NEXT:    s_cselect_b32 s8, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s7, s9
-; GCN-NEXT:    s_cselect_b32 s8, s8, s10
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s4, s4, s7
-; GCN-NEXT:    s_cselect_b32 s5, s5, s6
+; GCN-NEXT:    s_cselect_b32 s7, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s4, s9
+; GCN-NEXT:    s_cselect_b32 s5, s7, s5
+; GCN-NEXT:    s_cmp_lg_u32 s5, 0
+; GCN-NEXT:    s_cselect_b32 s4, s10, s4
+; GCN-NEXT:    s_cselect_b32 s5, s11, s6
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
-; GCN-IR-NEXT:    s_cmp_lg_u32 s18, 0
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_addc_u32 s13, 0, s14
 ; GCN-NEXT:    s_add_u32 s14, s8, s9
 ; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_addc_u32 s12, s12, s13
 ; GCN-NEXT:    s_mul_i32 s8, s10, s12
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
@@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_add_u32 s11, s14, s8
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_addc_u32 s10, s12, s10
 ; GCN-NEXT:    s_ashr_i32 s8, s7, 31
 ; GCN-NEXT:    s_add_u32 s6, s6, s8
@@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v0
 ; GCN-NEXT:    s_add_i32 s11, s12, s11
 ; GCN-NEXT:    s_mul_i32 s12, s5, s10
-; GCN-NEXT:    s_add_i32 s12, s11, s12
-; GCN-NEXT:    s_sub_i32 s13, s7, s12
+; GCN-NEXT:    s_add_i32 s14, s11, s12
+; GCN-NEXT:    s_sub_i32 s12, s7, s14
 ; GCN-NEXT:    s_mul_i32 s10, s4, s10
 ; GCN-NEXT:    s_sub_u32 s6, s6, s10
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s10, s11
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_subb_u32 s13, s13, s5
-; GCN-NEXT:    s_sub_u32 s15, s6, s4
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_cmp_lg_u32 s10, 0
-; GCN-NEXT:    s_subb_u32 s16, s13, 0
-; GCN-NEXT:    s_cmp_ge_u32 s16, s5
-; GCN-NEXT:    s_cselect_b32 s11, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s15, s4
-; GCN-NEXT:    s_cselect_b32 s17, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s16, s5
-; GCN-NEXT:    s_cselect_b32 s17, s17, s11
-; GCN-NEXT:    s_cmp_lg_u32 s10, 0
-; GCN-NEXT:    s_subb_u32 s13, s13, s5
-; GCN-NEXT:    s_sub_u32 s18, s15, s4
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s13, s10, s11
+; GCN-NEXT:    s_subb_u32 s15, s12, s5
+; GCN-NEXT:    s_sub_u32 s16, s6, s4
+; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT:    s_or_b32 s17, s12, s13
+; GCN-NEXT:    s_subb_u32 s17, s15, 0
+; GCN-NEXT:    s_cmp_ge_u32 s17, s5
+; GCN-NEXT:    s_cselect_b32 s18, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s16, s4
+; GCN-NEXT:    s_cselect_b32 s19, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s17, s5
+; GCN-NEXT:    s_cselect_b32 s18, s19, s18
+; GCN-NEXT:    s_or_b32 s12, s12, s13
+; GCN-NEXT:    s_subb_u32 s15, s15, s5
+; GCN-NEXT:    s_sub_u32 s19, s16, s4
+; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT:    s_or_b32 s12, s12, s13
+; GCN-NEXT:    s_subb_u32 s12, s15, 0
+; GCN-NEXT:    s_cmp_lg_u32 s18, 0
+; GCN-NEXT:    s_cselect_b32 s13, s19, s16
+; GCN-NEXT:    s_cselect_b32 s12, s12, s17
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_cmp_lg_u32 s10, 0
-; GCN-NEXT:    s_subb_u32 s10, s13, 0
-; GCN-NEXT:    s_cmp_lg_u32 s17, 0
-; GCN-NEXT:    s_cselect_b32 s11, s18, s15
-; GCN-NEXT:    s_cselect_b32 s10, s10, s16
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_subb_u32 s7, s7, s12
+; GCN-NEXT:    s_subb_u32 s7, s7, s14
 ; GCN-NEXT:    s_cmp_ge_u32 s7, s5
-; GCN-NEXT:    s_cselect_b32 s12, -1, 0
+; GCN-NEXT:    s_cselect_b32 s10, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s4
 ; GCN-NEXT:    s_cselect_b32 s4, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s7, s5
-; GCN-NEXT:    s_cselect_b32 s4, s4, s12
+; GCN-NEXT:    s_cselect_b32 s4, s4, s10
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s5, s10, s7
-; GCN-NEXT:    s_cselect_b32 s4, s11, s6
+; GCN-NEXT:    s_cselect_b32 s5, s12, s7
+; GCN-NEXT:    s_cselect_b32 s4, s13, s6
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GCN-NEXT:    s_sub_u32 s4, s4, s8
 ; GCN-NEXT:    s_subb_u32 s5, s5, s8
@@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
-; GCN-IR-NEXT:    s_cmp_lg_u32 s10, 0
 ; GCN-IR-NEXT:    s_addc_u32 s10, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
@@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_add_u32 s18, s18, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
-; GCN-IR-NEXT:    s_cmp_lg_u32 s20, 0
 ; GCN-IR-NEXT:    s_addc_u32 s19, s19, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
@@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_addc_u32 s10, 0, s11
 ; GCN-NEXT:    s_add_u32 s11, s6, s7
 ; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_addc_u32 s9, s9, s10
 ; GCN-NEXT:    s_mul_i32 s6, s2, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
@@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_add_u32 s2, s11, s2
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_addc_u32 s6, s9, s8
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, 24
@@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_mul_i32 s7, s5, s6
 ; GCN-NEXT:    s_mul_i32 s6, s4, s6
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_add_i32 s8, s8, s7
-; GCN-NEXT:    s_sub_i32 s9, 0, s8
-; GCN-NEXT:    s_sub_u32 s10, 24, s6
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s6, s7
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_subb_u32 s9, s9, s5
-; GCN-NEXT:    s_sub_u32 s12, s10, s4
+; GCN-NEXT:    s_add_i32 s10, s8, s7
+; GCN-NEXT:    s_sub_i32 s8, 0, s10
+; GCN-NEXT:    s_sub_u32 s11, 24, s6
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT:    s_or_b32 s9, s6, s7
+; GCN-NEXT:    s_subb_u32 s12, s8, s5
+; GCN-NEXT:    s_sub_u32 s13, s11, s4
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s14, s8, s9
+; GCN-NEXT:    s_subb_u32 s14, s12, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s5
+; GCN-NEXT:    s_cselect_b32 s15, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s13, s4
+; GCN-NEXT:    s_cselect_b32 s16, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s14, s5
+; GCN-NEXT:    s_cselect_b32 s15, s16, s15
+; GCN-NEXT:    s_or_b32 s8, s8, s9
+; GCN-NEXT:    s_subb_u32 s12, s12, s5
+; GCN-NEXT:    s_sub_u32 s16, s13, s4
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s8, s8, s9
+; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_cmp_lg_u32 s15, 0
+; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_subb_u32 s13, s9, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s5
+; GCN-NEXT:    s_subb_u32 s6, 0, s10
+; GCN-NEXT:    s_cmp_ge_u32 s6, s5
 ; GCN-NEXT:    s_cselect_b32 s7, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s12, s4
-; GCN-NEXT:    s_cselect_b32 s14, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s13, s5
-; GCN-NEXT:    s_cselect_b32 s14, s14, s7
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_subb_u32 s9, s9, s5
-; GCN-NEXT:    s_sub_u32 s15, s12, s4
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_subb_u32 s6, s9, 0
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_cselect_b32 s7, s15, s12
-; GCN-NEXT:    s_cselect_b32 s6, s6, s13
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_subb_u32 s8, 0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s8, s5
-; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s10, s4
+; GCN-NEXT:    s_cmp_ge_u32 s11, s4
 ; GCN-NEXT:    s_cselect_b32 s4, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, s5
-; GCN-NEXT:    s_cselect_b32 s4, s4, s9
+; GCN-NEXT:    s_cmp_eq_u32 s6, s5
+; GCN-NEXT:    s_cselect_b32 s4, s4, s7
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, s8
-; GCN-NEXT:    s_cselect_b32 s5, s7, s10
+; GCN-NEXT:    s_cselect_b32 s4, s8, s6
+; GCN-NEXT:    s_cselect_b32 s5, s9, s11
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s8, s2, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s9, s10, s11
-; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 0
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
-; GCN-IR-NEXT:    s_cmp_lg_u32 s16, 0
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_or_b32 s0, s0, s1
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_addc_u32 s3, s3, s9
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s2, s2, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; VI-NEXT:    s_addc_u32 s3, s3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s6, s2, s6
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s4, s3, s7
+; GFX9-NEXT:    s_add_u32 s4, s2, s6
+; GFX9-NEXT:    s_addc_u32 s5, s3, s7
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s2, s2, s4
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX11-NEXT:    s_addc_u32 s3, s3, s5
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_add_u32 s4, s4, s6
 ; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; SI-NEXT:    s_or_b32 s6, s12, s13
-; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    s_addc_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
@@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_add_u32 s2, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_addc_u32 s1, s5, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT:    s_addc_u32 s0, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s12, s14
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_addc_u32 s0, s13, s15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_add_u32 s0, s12, s14
+; GFX9-NEXT:    s_addc_u32 s1, s13, s15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s0, s12, s14
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s13, s15
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s4, s4, s6
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_addc_u32 s5, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 41199b0..fd461ac 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
-; GCN-IR-NEXT:    s_cmp_lg_u32 s16, 0
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[2:3], s[4:5]
@@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_addc_u32 s10, 0, s11
 ; GCN-NEXT:    s_add_u32 s11, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_addc_u32 s9, s9, s10
 ; GCN-NEXT:    s_mul_i32 s4, s6, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
@@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_add_u32 s8, s11, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_addc_u32 s4, s9, s6
 ; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
@@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
 ; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s8, 0, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    s_addc_u32 s10, 0, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mul_i32 s0, s3, s8
+; GCN-NEXT:    s_mul_i32 s0, s3, s10
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s9, s1, s0
-; GCN-NEXT:    s_sub_i32 s10, 0, s9
-; GCN-NEXT:    s_mul_i32 s0, s2, s8
-; GCN-NEXT:    s_sub_u32 s11, 24, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s12, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s10, s10, s3
-; GCN-NEXT:    s_sub_u32 s13, s11, s2
+; GCN-NEXT:    s_add_i32 s11, s1, s0
+; GCN-NEXT:    s_sub_i32 s8, 0, s11
+; GCN-NEXT:    s_mul_i32 s0, s2, s10
+; GCN-NEXT:    s_sub_u32 s12, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_or_b32 s9, s0, s1
+; GCN-NEXT:    s_subb_u32 s13, s8, s3
+; GCN-NEXT:    s_sub_u32 s14, s12, s2
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s8, s8, s9
+; GCN-NEXT:    s_subb_u32 s8, s13, 0
+; GCN-NEXT:    s_cmp_ge_u32 s8, s3
+; GCN-NEXT:    s_cselect_b32 s9, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s2
+; GCN-NEXT:    s_cselect_b32 s13, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, s3
+; GCN-NEXT:    s_cselect_b32 s8, s13, s9
+; GCN-NEXT:    s_add_u32 s9, s10, 1
+; GCN-NEXT:    s_addc_u32 s13, 0, 0
+; GCN-NEXT:    s_add_u32 s14, s10, 2
+; GCN-NEXT:    s_addc_u32 s15, 0, 0
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
+; GCN-NEXT:    s_cselect_b32 s8, s14, s9
+; GCN-NEXT:    s_cselect_b32 s9, s15, s13
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_subb_u32 s0, s10, 0
+; GCN-NEXT:    s_subb_u32 s0, 0, s11
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s1, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s2
-; GCN-NEXT:    s_cselect_b32 s10, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s12, s2
+; GCN-NEXT:    s_cselect_b32 s2, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, s3
-; GCN-NEXT:    s_cselect_b32 s0, s10, s1
-; GCN-NEXT:    s_add_u32 s1, s8, 1
-; GCN-NEXT:    s_addc_u32 s10, 0, 0
-; GCN-NEXT:    s_add_u32 s13, s8, 2
-; GCN-NEXT:    s_addc_u32 s14, 0, 0
+; GCN-NEXT:    s_cselect_b32 s0, s2, s1
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s13, s1
-; GCN-NEXT:    s_cselect_b32 s1, s14, s10
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s9, 0, s9
-; GCN-NEXT:    s_cmp_ge_u32 s9, s3
-; GCN-NEXT:    s_cselect_b32 s10, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s11, s2
-; GCN-NEXT:    s_cselect_b32 s2, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s9, s3
-; GCN-NEXT:    s_cselect_b32 s2, s2, s10
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, s8
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_cselect_b32 s0, s9, 0
+; GCN-NEXT:    s_cselect_b32 s1, s8, s10
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
-; GCN-IR-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
-; GCN-IR-NEXT:    s_cmp_lg_u32 s16, 0
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
-; GCN-IR-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s12, s12, s13
-; GCN-IR-NEXT:    s_cmp_lg_u32 s12, 0
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 9bcba6c..2d7ce10 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
 
 define amdgpu_kernel void @uitofp_i16_to_f16(
 ; SI-LABEL: uitofp_i16_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cdcc914..137dc1f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_addc_u32 s13, 0, s14
 ; GCN-NEXT:    s_add_u32 s14, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s12, s12, s13
 ; GCN-NEXT:    s_mul_i32 s0, s10, s12
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_add_u32 s11, s14, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s1, s12, s10
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    v_readfirstlane_b32 s10, v0
 ; GCN-NEXT:    s_add_i32 s5, s10, s5
 ; GCN-NEXT:    s_mul_i32 s10, s9, s4
-; GCN-NEXT:    s_add_i32 s10, s5, s10
-; GCN-NEXT:    s_sub_i32 s11, s7, s10
+; GCN-NEXT:    s_add_i32 s12, s5, s10
+; GCN-NEXT:    s_sub_i32 s10, s7, s12
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s12, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s11, s11, s9
-; GCN-NEXT:    s_sub_u32 s13, s6, s8
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_or_b32 s11, s4, s5
+; GCN-NEXT:    s_subb_u32 s13, s10, s9
+; GCN-NEXT:    s_sub_u32 s14, s6, s8
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s15, s10, s11
+; GCN-NEXT:    s_subb_u32 s15, s13, 0
+; GCN-NEXT:    s_cmp_ge_u32 s15, s9
+; GCN-NEXT:    s_cselect_b32 s16, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s8
+; GCN-NEXT:    s_cselect_b32 s17, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s15, s9
+; GCN-NEXT:    s_cselect_b32 s16, s17, s16
+; GCN-NEXT:    s_or_b32 s10, s10, s11
+; GCN-NEXT:    s_subb_u32 s13, s13, s9
+; GCN-NEXT:    s_sub_u32 s17, s14, s8
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_or_b32 s10, s10, s11
+; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_cmp_lg_u32 s16, 0
+; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s14, s11, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s9
+; GCN-NEXT:    s_subb_u32 s4, s7, s12
+; GCN-NEXT:    s_cmp_ge_u32 s4, s9
 ; GCN-NEXT:    s_cselect_b32 s5, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s8
-; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s14, s9
-; GCN-NEXT:    s_cselect_b32 s15, s15, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s11, s11, s9
-; GCN-NEXT:    s_sub_u32 s16, s13, s8
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_subb_u32 s4, s11, 0
-; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s5, s16, s13
-; GCN-NEXT:    s_cselect_b32 s4, s4, s14
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_subb_u32 s7, s7, s10
-; GCN-NEXT:    s_cmp_ge_u32 s7, s9
-; GCN-NEXT:    s_cselect_b32 s10, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s8
-; GCN-NEXT:    s_cselect_b32 s8, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s7, s9
-; GCN-NEXT:    s_cselect_b32 s8, s8, s10
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s4, s4, s7
-; GCN-NEXT:    s_cselect_b32 s5, s5, s6
+; GCN-NEXT:    s_cselect_b32 s7, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s4, s9
+; GCN-NEXT:    s_cselect_b32 s5, s7, s5
+; GCN-NEXT:    s_cmp_lg_u32 s5, 0
+; GCN-NEXT:    s_cselect_b32 s4, s10, s4
+; GCN-NEXT:    s_cselect_b32 s5, s11, s6
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
-; GCN-IR-NEXT:    s_cmp_lg_u32 s18, 0
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_addc_u32 s10, 0, s11
 ; GCN-NEXT:    s_add_u32 s11, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_addc_u32 s9, s9, s10
 ; GCN-NEXT:    s_mul_i32 s4, s6, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
@@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_add_u32 s8, s11, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_addc_u32 s4, s9, s6
 ; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
@@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_mul_i32 s0, s3, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s9, s1, s0
-; GCN-NEXT:    s_sub_i32 s10, 0, s9
+; GCN-NEXT:    s_add_i32 s10, s1, s0
+; GCN-NEXT:    s_sub_i32 s9, 0, s10
 ; GCN-NEXT:    s_mul_i32 s0, s2, s8
-; GCN-NEXT:    s_sub_u32 s8, 24, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_subb_u32 s10, s10, s3
-; GCN-NEXT:    s_sub_u32 s12, s8, s2
+; GCN-NEXT:    s_sub_u32 s11, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_or_b32 s8, s0, s1
+; GCN-NEXT:    s_subb_u32 s12, s9, s3
+; GCN-NEXT:    s_sub_u32 s13, s11, s2
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s14, s8, s9
+; GCN-NEXT:    s_subb_u32 s14, s12, 0
+; GCN-NEXT:    s_cmp_ge_u32 s14, s3
+; GCN-NEXT:    s_cselect_b32 s15, -1, 0
+; GCN-NEXT:    s_cmp_ge_u32 s13, s2
+; GCN-NEXT:    s_cselect_b32 s16, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s14, s3
+; GCN-NEXT:    s_cselect_b32 s15, s16, s15
+; GCN-NEXT:    s_or_b32 s8, s8, s9
+; GCN-NEXT:    s_subb_u32 s12, s12, s3
+; GCN-NEXT:    s_sub_u32 s16, s13, s2
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_or_b32 s8, s8, s9
+; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_cmp_lg_u32 s15, 0
+; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_subb_u32 s13, s10, 0
-; GCN-NEXT:    s_cmp_ge_u32 s13, s3
+; GCN-NEXT:    s_subb_u32 s0, 0, s10
+; GCN-NEXT:    s_cmp_ge_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s1, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s12, s2
-; GCN-NEXT:    s_cselect_b32 s14, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s13, s3
-; GCN-NEXT:    s_cselect_b32 s14, s14, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_subb_u32 s10, s10, s3
-; GCN-NEXT:    s_sub_u32 s15, s12, s2
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_subb_u32 s0, s10, 0
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_cselect_b32 s1, s15, s12
-; GCN-NEXT:    s_cselect_b32 s0, s0, s13
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_subb_u32 s9, 0, s9
-; GCN-NEXT:    s_cmp_ge_u32 s9, s3
-; GCN-NEXT:    s_cselect_b32 s10, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s8, s2
+; GCN-NEXT:    s_cmp_ge_u32 s11, s2
 ; GCN-NEXT:    s_cselect_b32 s2, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s9, s3
-; GCN-NEXT:    s_cselect_b32 s2, s2, s10
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, s9
-; GCN-NEXT:    s_cselect_b32 s1, s1, s8
+; GCN-NEXT:    s_cmp_eq_u32 s0, s3
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
+; GCN-NEXT:    s_cselect_b32 s1, s9, s11
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
-; GCN-IR-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
-; GCN-IR-NEXT:    s_cmp_lg_u32 s16, 0
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
-; GCN-IR-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
 ; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
 ; GCN-IR-NEXT:    s_or_b32 s14, s14, s15
-; GCN-IR-NEXT:    s_cmp_lg_u32 s14, 0
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index d67a7b1..e8db647 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_or_b32 s0, s0, s1
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_subb_u32 s3, s3, s9
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_sub_u32 s2, s2, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; VI-NEXT:    s_subb_u32 s3, s3, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s6, s2, s6
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_subb_u32 s4, s3, s7
+; GFX9-NEXT:    s_sub_u32 s4, s2, s6
+; GFX9-NEXT:    s_subb_u32 s5, s3, s7
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sub_u32 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    s_subb_u32 s3, s3, s7
 ; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s2, s2, s4
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX11-NEXT:    s_subb_u32 s3, s3, s5
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_sub_u32 s4, s4, s6
 ; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; SI-NEXT:    s_or_b32 s6, s12, s13
-; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    s_subb_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
@@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_sub_u32 s2, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_sub_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_subb_u32 s1, s5, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT:    s_subb_u32 s0, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s2, s12, s14
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s13, s15
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_sub_u32 s0, s12, s14
+; GFX9-NEXT:    s_subb_u32 s1, s13, s15
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sub_u32 s0, s12, s14
-; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_subb_u32 s1, s13, s15
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
@@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s4, s4, s6
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_subb_u32 s5, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 75db387..28c6b40 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_add_u32 s11, s12, s11
 ; GFX1032-NEXT:    s_addc_u32 s12, 0, s13
 ; GFX1032-NEXT:    s_add_u32 s8, s8, s11
-; GFX1032-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1032-NEXT:    s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT:    s_mul_i32 s11, s9, s8
 ; GFX1032-NEXT:    s_addc_u32 s5, s5, s12
-; GFX1032-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1032-NEXT:    s_mul_hi_u32 s11, s9, s8
+; GFX1032-NEXT:    s_mul_i32 s12, s9, s8
 ; GFX1032-NEXT:    s_mul_i32 s9, s9, s5
-; GFX1032-NEXT:    s_mul_hi_u32 s12, s8, s11
-; GFX1032-NEXT:    s_add_i32 s9, s13, s9
-; GFX1032-NEXT:    s_mul_hi_u32 s13, s5, s11
+; GFX1032-NEXT:    s_mul_i32 s10, s10, s8
+; GFX1032-NEXT:    s_add_i32 s9, s11, s9
+; GFX1032-NEXT:    s_mul_i32 s11, s5, s12
 ; GFX1032-NEXT:    s_add_i32 s9, s9, s10
-; GFX1032-NEXT:    s_mul_i32 s10, s5, s11
+; GFX1032-NEXT:    s_mul_hi_u32 s10, s8, s12
 ; GFX1032-NEXT:    s_mul_i32 s15, s8, s9
 ; GFX1032-NEXT:    s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT:    s_add_u32 s12, s12, s15
+; GFX1032-NEXT:    s_add_u32 s10, s10, s15
+; GFX1032-NEXT:    s_mul_hi_u32 s13, s5, s12
 ; GFX1032-NEXT:    s_addc_u32 s14, 0, s14
-; GFX1032-NEXT:    s_mul_hi_u32 s11, s5, s9
-; GFX1032-NEXT:    s_add_u32 s10, s12, s10
+; GFX1032-NEXT:    s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT:    s_add_u32 s10, s10, s11
 ; GFX1032-NEXT:    s_mul_i32 s9, s5, s9
 ; GFX1032-NEXT:    s_addc_u32 s10, s14, s13
-; GFX1032-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1032-NEXT:    s_addc_u32 s11, s12, 0
 ; GFX1032-NEXT:    s_add_u32 s9, s10, s9
 ; GFX1032-NEXT:    s_addc_u32 s10, 0, s11
 ; GFX1032-NEXT:    s_add_u32 s8, s8, s9
-; GFX1032-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1032-NEXT:    s_mul_hi_u32 s11, s2, s8
-; GFX1032-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX1032-NEXT:    s_mul_hi_u32 s9, s3, s8
 ; GFX1032-NEXT:    s_addc_u32 s5, s5, s10
-; GFX1032-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s8
 ; GFX1032-NEXT:    s_mul_i32 s12, s2, s5
-; GFX1032-NEXT:    s_mul_hi_u32 s10, s2, s5
-; GFX1032-NEXT:    s_add_u32 s11, s11, s12
-; GFX1032-NEXT:    s_addc_u32 s10, 0, s10
+; GFX1032-NEXT:    s_mul_hi_u32 s11, s2, s5
+; GFX1032-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX1032-NEXT:    s_mul_i32 s8, s3, s8
+; GFX1032-NEXT:    s_add_u32 s9, s9, s12
+; GFX1032-NEXT:    s_addc_u32 s11, 0, s11
 ; GFX1032-NEXT:    s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT:    s_add_u32 s8, s11, s8
+; GFX1032-NEXT:    s_add_u32 s8, s9, s8
 ; GFX1032-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1032-NEXT:    s_addc_u32 s8, s10, s9
+; GFX1032-NEXT:    s_addc_u32 s8, s11, s10
 ; GFX1032-NEXT:    s_addc_u32 s9, s13, 0
 ; GFX1032-NEXT:    s_add_u32 s5, s8, s5
 ; GFX1032-NEXT:    s_addc_u32 s8, 0, s9
@@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_sub_i32 s11, s3, s9
 ; GFX1032-NEXT:    s_sub_u32 s10, s2, s10
 ; GFX1032-NEXT:    s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX1032-NEXT:    s_subb_u32 s11, s11, s1
 ; GFX1032-NEXT:    s_sub_u32 s13, s10, s0
-; GFX1032-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1032-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX1032-NEXT:    s_subb_u32 s11, s11, 0
 ; GFX1032-NEXT:    s_cmp_ge_u32 s11, s1
 ; GFX1032-NEXT:    s_cselect_b32 s14, -1, 0
@@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s0
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT:    s_sub_u32 s9, 0, s0
-; GFX1064-NEXT:    s_subb_u32 s10, 0, s1
+; GFX1064-NEXT:    s_sub_u32 s8, 0, s0
+; GFX1064-NEXT:    s_subb_u32 s9, 0, s1
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX1064-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX1064-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT:    s_mul_i32 s5, s9, s8
-; GFX1064-NEXT:    s_mul_hi_u32 s12, s9, s4
-; GFX1064-NEXT:    s_mul_i32 s11, s10, s4
-; GFX1064-NEXT:    s_add_i32 s5, s12, s5
-; GFX1064-NEXT:    s_mul_i32 s13, s9, s4
-; GFX1064-NEXT:    s_add_i32 s5, s5, s11
-; GFX1064-NEXT:    s_mul_hi_u32 s12, s4, s13
-; GFX1064-NEXT:    s_mul_i32 s15, s4, s5
-; GFX1064-NEXT:    s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT:    s_mul_i32 s11, s8, s13
-; GFX1064-NEXT:    s_mul_hi_u32 s13, s4, s5
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT:    s_mul_i32 s10, s8, s4
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s8, s5
+; GFX1064-NEXT:    s_mul_i32 s11, s9, s5
+; GFX1064-NEXT:    s_add_i32 s10, s12, s10
+; GFX1064-NEXT:    s_mul_i32 s13, s8, s5
+; GFX1064-NEXT:    s_add_i32 s10, s10, s11
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s5, s13
+; GFX1064-NEXT:    s_mul_i32 s15, s5, s10
+; GFX1064-NEXT:    s_mul_hi_u32 s14, s4, s13
+; GFX1064-NEXT:    s_mul_i32 s11, s4, s13
+; GFX1064-NEXT:    s_mul_hi_u32 s13, s5, s10
 ; GFX1064-NEXT:    s_add_u32 s12, s12, s15
 ; GFX1064-NEXT:    s_addc_u32 s13, 0, s13
-; GFX1064-NEXT:    s_mul_hi_u32 s16, s8, s5
+; GFX1064-NEXT:    s_mul_hi_u32 s16, s4, s10
 ; GFX1064-NEXT:    s_add_u32 s11, s12, s11
-; GFX1064-NEXT:    s_mul_i32 s5, s8, s5
+; GFX1064-NEXT:    s_mul_i32 s10, s4, s10
 ; GFX1064-NEXT:    s_addc_u32 s11, s13, s14
 ; GFX1064-NEXT:    s_addc_u32 s12, s16, 0
-; GFX1064-NEXT:    s_add_u32 s5, s11, s5
+; GFX1064-NEXT:    s_add_u32 s10, s11, s10
 ; GFX1064-NEXT:    s_addc_u32 s11, 0, s12
-; GFX1064-NEXT:    s_add_u32 s12, s4, s5
-; GFX1064-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT:    s_mul_hi_u32 s13, s9, s12
-; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT:    s_mul_i32 s4, s9, s12
-; GFX1064-NEXT:    s_addc_u32 s8, s8, s11
-; GFX1064-NEXT:    s_mul_i32 s10, s10, s12
-; GFX1064-NEXT:    s_mul_i32 s9, s9, s8
-; GFX1064-NEXT:    s_mul_hi_u32 s5, s12, s4
-; GFX1064-NEXT:    s_add_i32 s9, s13, s9
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s8, s4
-; GFX1064-NEXT:    s_add_i32 s9, s9, s10
-; GFX1064-NEXT:    s_mul_i32 s4, s8, s4
-; GFX1064-NEXT:    s_mul_i32 s14, s12, s9
-; GFX1064-NEXT:    s_mul_hi_u32 s13, s12, s9
-; GFX1064-NEXT:    s_add_u32 s5, s5, s14
+; GFX1064-NEXT:    s_add_u32 s5, s5, s10
+; GFX1064-NEXT:    s_addc_u32 s4, s4, s11
+; GFX1064-NEXT:    s_mul_hi_u32 s10, s8, s5
+; GFX1064-NEXT:    s_mul_i32 s11, s8, s5
+; GFX1064-NEXT:    s_mul_i32 s8, s8, s4
+; GFX1064-NEXT:    s_mul_i32 s9, s9, s5
+; GFX1064-NEXT:    s_add_i32 s8, s10, s8
+; GFX1064-NEXT:    s_mul_i32 s10, s4, s11
+; GFX1064-NEXT:    s_add_i32 s8, s8, s9
+; GFX1064-NEXT:    s_mul_hi_u32 s9, s5, s11
+; GFX1064-NEXT:    s_mul_i32 s14, s5, s8
+; GFX1064-NEXT:    s_mul_hi_u32 s13, s5, s8
+; GFX1064-NEXT:    s_add_u32 s9, s9, s14
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s4, s11
 ; GFX1064-NEXT:    s_addc_u32 s13, 0, s13
-; GFX1064-NEXT:    s_mul_hi_u32 s10, s8, s9
-; GFX1064-NEXT:    s_add_u32 s4, s5, s4
-; GFX1064-NEXT:    s_mul_i32 s9, s8, s9
-; GFX1064-NEXT:    s_addc_u32 s4, s13, s11
-; GFX1064-NEXT:    s_addc_u32 s5, s10, 0
-; GFX1064-NEXT:    s_add_u32 s4, s4, s9
-; GFX1064-NEXT:    s_addc_u32 s9, 0, s5
-; GFX1064-NEXT:    s_add_u32 s10, s12, s4
-; GFX1064-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT:    s_mul_hi_u32 s11, s2, s10
-; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT:    s_mul_hi_u32 s4, s3, s10
-; GFX1064-NEXT:    s_addc_u32 s5, s8, s9
-; GFX1064-NEXT:    s_mul_i32 s8, s3, s10
-; GFX1064-NEXT:    s_mul_i32 s10, s2, s5
-; GFX1064-NEXT:    s_mul_hi_u32 s9, s2, s5
-; GFX1064-NEXT:    s_add_u32 s10, s11, s10
-; GFX1064-NEXT:    s_addc_u32 s9, 0, s9
-; GFX1064-NEXT:    s_mul_hi_u32 s12, s3, s5
-; GFX1064-NEXT:    s_add_u32 s8, s10, s8
+; GFX1064-NEXT:    s_mul_hi_u32 s11, s4, s8
+; GFX1064-NEXT:    s_add_u32 s9, s9, s10
+; GFX1064-NEXT:    s_mul_i32 s8, s4, s8
+; GFX1064-NEXT:    s_addc_u32 s9, s13, s12
+; GFX1064-NEXT:    s_addc_u32 s10, s11, 0
+; GFX1064-NEXT:    s_add_u32 s8, s9, s8
+; GFX1064-NEXT:    s_addc_u32 s9, 0, s10
+; GFX1064-NEXT:    s_add_u32 s5, s5, s8
+; GFX1064-NEXT:    s_addc_u32 s4, s4, s9
+; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s5
+; GFX1064-NEXT:    s_mul_i32 s11, s2, s4
+; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s4
+; GFX1064-NEXT:    s_mul_hi_u32 s9, s3, s5
 ; GFX1064-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1064-NEXT:    s_addc_u32 s4, s9, s4
+; GFX1064-NEXT:    s_add_u32 s8, s8, s11
+; GFX1064-NEXT:    s_addc_u32 s10, 0, s10
+; GFX1064-NEXT:    s_mul_hi_u32 s12, s3, s4
+; GFX1064-NEXT:    s_add_u32 s5, s8, s5
+; GFX1064-NEXT:    s_mul_i32 s4, s3, s4
+; GFX1064-NEXT:    s_addc_u32 s5, s10, s9
 ; GFX1064-NEXT:    s_addc_u32 s8, s12, 0
-; GFX1064-NEXT:    s_add_u32 s10, s4, s5
+; GFX1064-NEXT:    s_add_u32 s10, s5, s4
 ; GFX1064-NEXT:    s_addc_u32 s11, 0, s8
 ; GFX1064-NEXT:    s_mul_hi_u32 s4, s0, s10
 ; GFX1064-NEXT:    s_mul_i32 s5, s0, s11
 ; GFX1064-NEXT:    s_mul_i32 s8, s1, s10
 ; GFX1064-NEXT:    s_add_i32 s4, s4, s5
-; GFX1064-NEXT:    s_add_i32 s12, s4, s8
+; GFX1064-NEXT:    s_add_i32 s8, s4, s8
 ; GFX1064-NEXT:    s_mul_i32 s4, s0, s10
-; GFX1064-NEXT:    s_sub_i32 s8, s3, s12
-; GFX1064-NEXT:    s_sub_u32 s13, s2, s4
+; GFX1064-NEXT:    s_sub_i32 s9, s3, s8
+; GFX1064-NEXT:    s_sub_u32 s12, s2, s4
 ; GFX1064-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT:    s_subb_u32 s14, s8, s1
-; GFX1064-NEXT:    s_sub_u32 s15, s13, s0
-; GFX1064-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX1064-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX1064-NEXT:    s_subb_u32 s8, s14, 0
-; GFX1064-NEXT:    s_cmp_ge_u32 s8, s1
-; GFX1064-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT:    s_cmp_ge_u32 s15, s0
+; GFX1064-NEXT:    s_subb_u32 s9, s9, s1
+; GFX1064-NEXT:    s_sub_u32 s13, s12, s0
+; GFX1064-NEXT:    s_subb_u32 s9, s9, 0
+; GFX1064-NEXT:    s_cmp_ge_u32 s9, s1
 ; GFX1064-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT:    s_cmp_eq_u32 s8, s1
-; GFX1064-NEXT:    s_cselect_b32 s8, s14, s9
-; GFX1064-NEXT:    s_add_u32 s9, s10, 1
+; GFX1064-NEXT:    s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX1064-NEXT:    s_cmp_eq_u32 s9, s1
+; GFX1064-NEXT:    s_cselect_b32 s9, s13, s14
+; GFX1064-NEXT:    s_add_u32 s13, s10, 1
 ; GFX1064-NEXT:    s_addc_u32 s14, s11, 0
 ; GFX1064-NEXT:    s_add_u32 s15, s10, 2
 ; GFX1064-NEXT:    s_addc_u32 s16, s11, 0
-; GFX1064-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX1064-NEXT:    s_cselect_b32 s15, s15, s9
+; GFX1064-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX1064-NEXT:    s_cselect_b32 s13, s15, s13
 ; GFX1064-NEXT:    s_cselect_b32 s14, s16, s14
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT:    s_subb_u32 s3, s3, s12
+; GFX1064-NEXT:    s_subb_u32 s3, s3, s8
 ; GFX1064-NEXT:    s_cmp_ge_u32 s3, s1
 ; GFX1064-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT:    s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT:    s_cmp_ge_u32 s12, s0
 ; GFX1064-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1064-NEXT:    s_cmp_eq_u32 s3, s1
 ; GFX1064-NEXT:    s_cselect_b32 s1, s5, s4
 ; GFX1064-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1064-NEXT:    s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT:    s_cselect_b32 s4, s15, s10
+; GFX1064-NEXT:    s_cselect_b32 s4, s13, s10
 ; GFX1064-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX1064-NEXT:  .LBB15_2:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 64d055b..4445383 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
 ; DAGISEL-GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX8-NEXT:    s_or_b32 s4, s12, s13
 ; DAGISEL-GFX8-NEXT:    s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; DAGISEL-GFX8-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; DAGISEL-GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; DAGISEL-GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
 ; DAGISEL-GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX942-NEXT:    s_or_b32 s0, s12, s13
 ; DAGISEL-GFX942-NEXT:    s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; DAGISEL-GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; DAGISEL-GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; DAGISEL-GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
 ; DAGISEL-GFX12-NEXT:    s_or_b32 s0, ttmp9, s0
 ; DAGISEL-GFX12-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-GFX12-NEXT:    s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT:    s_cmp_lg_u32 s0, 0
 ; DAGISEL-GFX12-NEXT:    s_cselect_b32 s0, -1, 0
 ; DAGISEL-GFX12-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
 ; GISEL-GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX8-NEXT:    s_or_b32 s4, s12, s13
 ; GISEL-GFX8-NEXT:    s_or_b32 s4, s4, s14
-; GISEL-GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GISEL-GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
 ; GISEL-GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX942-NEXT:    s_or_b32 s0, s12, s13
 ; GISEL-GFX942-NEXT:    s_or_b32 s0, s0, s14
-; GISEL-GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GISEL-GFX942-NEXT:    s_cselect_b32 s0, 1, 0
 ; GISEL-GFX942-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-GFX942-NEXT:    s_setpc_b64 s[30:31]
@@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
 ; GISEL-GFX12-NEXT:    s_or_b32 s0, ttmp9, s0
 ; GISEL-GFX12-NEXT:    s_wait_alu 0xfffe
 ; GISEL-GFX12-NEXT:    s_or_b32 s0, s0, s1
-; GISEL-GFX12-NEXT:    s_wait_alu 0xfffe
-; GISEL-GFX12-NEXT:    s_cmp_lg_u32 s0, 0
 ; GISEL-GFX12-NEXT:    s_cselect_b32 s0, 1, 0
 ; GISEL-GFX12-NEXT:    s_wait_alu 0xfffe
 ; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
new file mode 100644
index 0000000..df0cbeb
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; This IR is hand-written.
+
+; ModuleID = 'ptr-named-2.ll'
+source_filename = "ptr-named-2.ll"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel-unknown-none"
+
+%struct.TypeExamples = type { i32*, i32, i32, i32* }
+
+@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!2, !3, !4}
+!llvm.ident = !{!21}
+
+; CHECK-BTF:      [1] STRUCT 'TypeExamples' size=32 vlen=4
+; CHECK-BTF-NEXT:         'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT:         'volatile' type_id=4 bits_offset=64
+; CHECK-BTF-NEXT:         'const' type_id=5 bits_offset=128
+; CHECK-BTF-NEXT:         'restrict_ptr' type_id=6 bits_offset=192
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3
+; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3
+; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7
+; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static
+; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1
+; CHECK-BTF-NEXT:         type_id=8 offset=0 size=24
+
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None)
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true)
+!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp")
+!7 = !{}
+!8 = !{!0}
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10)
+!10 = !{!11, !12, !13, !14}
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64)
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16)
+!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16)
+!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64)
+!21 = !{!"my hand-written IR"}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
new file mode 100644
index 0000000..675c34e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+;   #![no_std]
+;   #![no_main]
+;
+;   pub struct MyType {
+;       ptr: *const u32,
+;   }
+;
+;   impl MyType {
+;       pub const fn new() -> Self {
+;           let ptr = core::ptr::null();
+;           Self { ptr }
+;       }
+;   }
+;
+;   unsafe impl Sync for MyType {}
+;
+;   #[unsafe(no_mangle)]
+;   pub static X: MyType = MyType::new();
+;
+;   #[cfg(not(test))]
+;   #[panic_handler]
+;   fn panic(_info: &core::panic::PanicInfo) -> ! {
+;       loop {}
+;   }
+; Compilation flag:
+;   cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+;   llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc
+;   llvm-dis ptr-named.bc -o ptr-named.ll
+
+; ModuleID = 'ptr-named.bc'
+source_filename = "1m2uqe50qkwxmo53ydydvou91"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0
+
+!llvm.module.flags = !{!11, !12, !13, !14}
+!llvm.ident = !{!15}
+!llvm.dbg.cu = !{!16}
+
+; CHECK-BTF:      [1] STRUCT 'MyType' size=8 vlen=1
+; CHECK-BTF-NEXT:         'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT:         type_id=4 offset=0 size=8
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64)
+!2 = !DINamespace(name: "ptr_named", scope: null)
+!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0)
+!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!10 = !{}
+!11 = !{i32 8, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{i32 7, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 3}
+!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"}
+!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None)
+!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named")
+!18 = !{!0}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
index 4f13f47..56798c8 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
@@ -28,6 +28,11 @@ define void @test() {
       @llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str)
   ; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S }
 
+  ; StructuredBuffer<float[3][2]>
+  %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null)
+  ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] }
+
   ; ByteAddressBuffer
   %byteaddr = call target("dx.RawBuffer", i8, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null)
@@ -40,12 +45,14 @@ define void @test() {
 ; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>"
 ; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>"
 ; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>"
+; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>"
 ; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer
 
 ; CHECK: !{i32 0, ptr @[[T0]], !"A"
 ; CHECK: !{i32 1, ptr @[[T1]], !""
 ; CHECK: !{i32 2, ptr @[[T2]], !""
 ; CHECK: !{i32 3, ptr @[[S0]], !"SB"
-; CHECK: !{i32 4, ptr @[[B0]], !""
+; CHECK: !{i32 4, ptr @[[S1]], !""
+; CHECK: !{i32 5, ptr @[[B0]], !""
 
 attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/swp-phi.ll b/llvm/test/CodeGen/Hexagon/swp-phi.ll
index 9b0e126..6ce2481 100644
--- a/llvm/test/CodeGen/Hexagon/swp-phi.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-phi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -enable-unsafe-fp-math -enable-pipeliner \
+; RUN: llc -mtriple=hexagon -enable-pipeliner \
 ; RUN:     -pipeliner-prune-deps=false -stats -o /dev/null < %s
 ; REQUIRES: asserts
 
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
index 2894fff..da0d13d 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
@@ -1,5 +1,5 @@
 {
-    "entities" : {
+    "Opcodes" : {
         "ABS_Fp":[1, 2],
         "ADC":[3, 4],
         "ADD":[5, 6],
@@ -7,5 +7,21 @@
         "ADDPDrr":[9, 10],
         "ADDPSrr":[11, 12],
         "ADDSDrm":[13, 14]
+    },
+    "CommonOperands": {
+        "Immediate": [0.1, 0.1],
+        "MBB": [0.2, 0.2],
+        "FrameIndex": [0.3, 0.3],
+        "GlobalAddress": [0.4, 0.4]
+    },
+    "PhysicalRegisters": {
+        "GR32": [0.5, 0.5],
+        "GR64": [0.6, 0.6],
+        "XMM": [0.7, 0.7]
+    },
+    "VirtualRegisters": {
+        "GR32": [0.8, 0.8],
+        "GR64": [0.9, 0.9],
+        "XMM": [1.0, 1.0]
     }
 }
 \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
index 5de715b..f4b14a4 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
@@ -1,5 +1,5 @@
 {
-  "entities": {
+  "Opcodes": {
     "KILL": [0.1, 0.2, 0.3],
     "MOV": [0.4, 0.5, 0.6],
     "LEA": [0.7, 0.8, 0.9],
@@ -18,5 +18,21 @@
     "POP": [4.6, 4.7, 4.8],
     "NOP": [4.9, 5.0, 5.1],
     "COPY": [5.2, 5.3, 5.4]
+  },
+  "CommonOperands": {
+    "Immediate": [0.1, 0.1, 0.1],
+    "MBB": [0.2, 0.2, 0.2],
+    "FrameIndex": [0.3, 0.3, 0.3],
+    "GlobalAddress": [0.4, 0.4, 0.4]
+  },
+  "PhysicalRegisters": {
+    "GR32": [0.5, 0.5, 0.5],
+    "GR64": [0.6, 0.6, 0.6],
+    "XMM": [0.7, 0.7, 0.7]
+  },
+  "VirtualRegisters": {
+    "GR32": [0.8, 0.8, 0.8],
+    "GR64": [0.9, 0.9, 0.9],
+    "XMM": [1.0, 1.0, 1.0]
   }
 }
 \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
index bf04163..6274fb7 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
@@ -1,7 +1,16 @@
 {
-    "entities": {
+    "Opcodes": {
         "ADD": [1.0, 2.0, 3.0],
         "SUB": [1.5],
         "MUL": [2.0, 3.0]
+    },
+    "CommonOperands": {
+        "Immediate": [1.0]
+    },
+    "PhysicalRegisters": {
+        "GR32": [1.0, 2.0]
+    },
+    "VirtualRegisters": {
+        "GR32": [1.0, 2.0, 3.0]
     }
 }
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
index 63e8ccbd..7bfdf3b 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
@@ -1,5 +1,5 @@
 {
-    "entities": {
+    "Opcodes": {
         "ADD": [],
         "SUB": [],
         "MUL": [],
@@ -8,5 +8,14 @@
         "JMP": [],
         "CALL": [],
         "RET": []
+    },
+    "CommonOperands": {
+        "Immediate": []
+    },
+    "PhysicalRegisters": {
+        "GR32": []
+    },
+    "VirtualRegisters": {
+        "GR32": []
     }
 }
 \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index 6327cff..d3c0da9 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA:  [ 0.00  0.00 ]
 Key: XSTORE:  [ 0.00  0.00 ]
 Key: XSUSLDTRK:  [ 0.00  0.00 ]
 Key: XTEST:  [ 0.00  0.00 ]
+Key: Immediate:  [ 0.10  0.10 ]
+Key: CImmediate:  [ 0.00  0.00 ]
+Key: FPImmediate:  [ 0.00  0.00 ]
+Key: MBB:  [ 0.20  0.20 ]
+Key: FrameIndex:  [ 0.30  0.30 ]
+Key: ConstantPoolIndex:  [ 0.00  0.00 ]
+Key: TargetIndex:  [ 0.00  0.00 ]
+Key: JumpTableIndex:  [ 0.00  0.00 ]
+Key: ExternalSymbol:  [ 0.00  0.00 ]
+Key: GlobalAddress:  [ 0.40  0.40 ]
+Key: BlockAddress:  [ 0.00  0.00 ]
+Key: RegisterMask:  [ 0.00  0.00 ]
+Key: RegisterLiveOut:  [ 0.00  0.00 ]
+Key: Metadata:  [ 0.00  0.00 ]
+Key: MCSymbol:  [ 0.00  0.00 ]
+Key: CFIIndex:  [ 0.00  0.00 ]
+Key: IntrinsicID:  [ 0.00  0.00 ]
+Key: Predicate:  [ 0.00  0.00 ]
+Key: ShuffleMask:  [ 0.00  0.00 ]
+Key: PhyReg_GR8:  [ 0.00  0.00 ]
+Key: PhyReg_GRH8:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_ABCD_H:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_ABCD_L:  [ 0.00  0.00 ]
+Key: PhyReg_GRH16:  [ 0.00  0.00 ]
+Key: PhyReg_GR16:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK1:  [ 0.00  0.00 ]
+Key: PhyReg_VK16:  [ 0.00  0.00 ]
+Key: PhyReg_VK2:  [ 0.00  0.00 ]
+Key: PhyReg_VK4:  [ 0.00  0.00 ]
+Key: PhyReg_VK8:  [ 0.00  0.00 ]
+Key: PhyReg_VK16WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK1WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK2WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK4WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK8WM:  [ 0.00  0.00 ]
+Key: PhyReg_SEGMENT_REG:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_FPCCR:  [ 0.00  0.00 ]
+Key: PhyReg_FR16X:  [ 0.00  0.00 ]
+Key: PhyReg_FR16:  [ 0.00  0.00 ]
+Key: PhyReg_VK16PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK1PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK2PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK4PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK8PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_FR32X:  [ 0.00  0.00 ]
+Key: PhyReg_GR32:  [ 0.50  0.50 ]
+Key: PhyReg_GR32_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_DEBUG_REG:  [ 0.00  0.00 ]
+Key: PhyReg_FR32:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK32:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_RFP32:  [ 0.00  0.00 ]
+Key: PhyReg_VK32WM:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_AD:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_CCR:  [ 0.00  0.00 ]
+Key: PhyReg_DFCCR:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_RFP64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64:  [ 0.60  0.60 ]
+Key: PhyReg_FR64X:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_CONTROL_REG:  [ 0.00  0.00 ]
+Key: PhyReg_FR64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK64:  [ 0.00  0.00 ]
+Key: PhyReg_VR64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_VK64WM:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_AD:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_A:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_RST:  [ 0.00  0.00 ]
+Key: PhyReg_RFP80:  [ 0.00  0.00 ]
+Key: PhyReg_RFP80_7:  [ 0.00  0.00 ]
+Key: PhyReg_VR128X:  [ 0.00  0.00 ]
+Key: PhyReg_VR128:  [ 0.00  0.00 ]
+Key: PhyReg_VR256X:  [ 0.00  0.00 ]
+Key: PhyReg_VR256:  [ 0.00  0.00 ]
+Key: PhyReg_VR512:  [ 0.00  0.00 ]
+Key: PhyReg_VR512_0_15:  [ 0.00  0.00 ]
+Key: PhyReg_TILE:  [ 0.00  0.00 ]
+Key: PhyReg_TILEPAIR:  [ 0.00  0.00 ]
+Key: VirtReg_GR8:  [ 0.00  0.00 ]
+Key: VirtReg_GRH8:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_ABCD_H:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_ABCD_L:  [ 0.00  0.00 ]
+Key: VirtReg_GRH16:  [ 0.00  0.00 ]
+Key: VirtReg_GR16:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK1:  [ 0.00  0.00 ]
+Key: VirtReg_VK16:  [ 0.00  0.00 ]
+Key: VirtReg_VK2:  [ 0.00  0.00 ]
+Key: VirtReg_VK4:  [ 0.00  0.00 ]
+Key: VirtReg_VK8:  [ 0.00  0.00 ]
+Key: VirtReg_VK16WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK1WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK2WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK4WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK8WM:  [ 0.00  0.00 ]
+Key: VirtReg_SEGMENT_REG:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_FPCCR:  [ 0.00  0.00 ]
+Key: VirtReg_FR16X:  [ 0.00  0.00 ]
+Key: VirtReg_FR16:  [ 0.00  0.00 ]
+Key: VirtReg_VK16PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK1PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK2PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK4PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK8PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_FR32X:  [ 0.00  0.00 ]
+Key: VirtReg_GR32:  [ 0.80  0.80 ]
+Key: VirtReg_GR32_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_DEBUG_REG:  [ 0.00  0.00 ]
+Key: VirtReg_FR32:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK32:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_RFP32:  [ 0.00  0.00 ]
+Key: VirtReg_VK32WM:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_AD:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_CCR:  [ 0.00  0.00 ]
+Key: VirtReg_DFCCR:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_RFP64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64:  [ 0.90  0.90 ]
+Key: VirtReg_FR64X:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_CONTROL_REG:  [ 0.00  0.00 ]
+Key: VirtReg_FR64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK64:  [ 0.00  0.00 ]
+Key: VirtReg_VR64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_VK64WM:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_AD:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_A:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_RST:  [ 0.00  0.00 ]
+Key: VirtReg_RFP80:  [ 0.00  0.00 ]
+Key: VirtReg_RFP80_7:  [ 0.00  0.00 ]
+Key: VirtReg_VR128X:  [ 0.00  0.00 ]
+Key: VirtReg_VR128:  [ 0.00  0.00 ]
+Key: VirtReg_VR256X:  [ 0.00  0.00 ]
+Key: VirtReg_VR256:  [ 0.00  0.00 ]
+Key: VirtReg_VR512:  [ 0.00  0.00 ]
+Key: VirtReg_VR512_0_15:  [ 0.00  0.00 ]
+Key: VirtReg_TILE:  [ 0.00  0.00 ]
+Key: VirtReg_TILEPAIR:  [ 0.00  0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index 4409e6d..c6e5508 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA:  [ 0.00  0.00 ]
 Key: XSTORE:  [ 0.00  0.00 ]
 Key: XSUSLDTRK:  [ 0.00  0.00 ]
 Key: XTEST:  [ 0.00  0.00 ]
+Key: Immediate:  [ 0.10  0.10 ]
+Key: CImmediate:  [ 0.00  0.00 ]
+Key: FPImmediate:  [ 0.00  0.00 ]
+Key: MBB:  [ 0.20  0.20 ]
+Key: FrameIndex:  [ 0.30  0.30 ]
+Key: ConstantPoolIndex:  [ 0.00  0.00 ]
+Key: TargetIndex:  [ 0.00  0.00 ]
+Key: JumpTableIndex:  [ 0.00  0.00 ]
+Key: ExternalSymbol:  [ 0.00  0.00 ]
+Key: GlobalAddress:  [ 0.40  0.40 ]
+Key: BlockAddress:  [ 0.00  0.00 ]
+Key: RegisterMask:  [ 0.00  0.00 ]
+Key: RegisterLiveOut:  [ 0.00  0.00 ]
+Key: Metadata:  [ 0.00  0.00 ]
+Key: MCSymbol:  [ 0.00  0.00 ]
+Key: CFIIndex:  [ 0.00  0.00 ]
+Key: IntrinsicID:  [ 0.00  0.00 ]
+Key: Predicate:  [ 0.00  0.00 ]
+Key: ShuffleMask:  [ 0.00  0.00 ]
+Key: PhyReg_GR8:  [ 0.00  0.00 ]
+Key: PhyReg_GRH8:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_ABCD_H:  [ 0.00  0.00 ]
+Key: PhyReg_GR8_ABCD_L:  [ 0.00  0.00 ]
+Key: PhyReg_GRH16:  [ 0.00  0.00 ]
+Key: PhyReg_GR16:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK1:  [ 0.00  0.00 ]
+Key: PhyReg_VK16:  [ 0.00  0.00 ]
+Key: PhyReg_VK2:  [ 0.00  0.00 ]
+Key: PhyReg_VK4:  [ 0.00  0.00 ]
+Key: PhyReg_VK8:  [ 0.00  0.00 ]
+Key: PhyReg_VK16WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK1WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK2WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK4WM:  [ 0.00  0.00 ]
+Key: PhyReg_VK8WM:  [ 0.00  0.00 ]
+Key: PhyReg_SEGMENT_REG:  [ 0.00  0.00 ]
+Key: PhyReg_GR16_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_FPCCR:  [ 0.00  0.00 ]
+Key: PhyReg_FR16X:  [ 0.00  0.00 ]
+Key: PhyReg_FR16:  [ 0.00  0.00 ]
+Key: PhyReg_VK16PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK1PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK2PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK4PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK8PAIR:  [ 0.00  0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_FR32X:  [ 0.00  0.00 ]
+Key: PhyReg_GR32:  [ 0.50  0.50 ]
+Key: PhyReg_GR32_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_DEBUG_REG:  [ 0.00  0.00 ]
+Key: PhyReg_FR32:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK32:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_RFP32:  [ 0.00  0.00 ]
+Key: PhyReg_VK32WM:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_AD:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_CCR:  [ 0.00  0.00 ]
+Key: PhyReg_DFCCR:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit:  [ 0.00  0.00 ]
+Key: PhyReg_RFP64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64:  [ 0.60  0.60 ]
+Key: PhyReg_FR64X:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_CONTROL_REG:  [ 0.00  0.00 ]
+Key: PhyReg_FR64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_VK64:  [ 0.00  0.00 ]
+Key: PhyReg_VR64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: PhyReg_VK64WM:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ABCD:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_AD:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_A:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: PhyReg_RST:  [ 0.00  0.00 ]
+Key: PhyReg_RFP80:  [ 0.00  0.00 ]
+Key: PhyReg_RFP80_7:  [ 0.00  0.00 ]
+Key: PhyReg_VR128X:  [ 0.00  0.00 ]
+Key: PhyReg_VR128:  [ 0.00  0.00 ]
+Key: PhyReg_VR256X:  [ 0.00  0.00 ]
+Key: PhyReg_VR256:  [ 0.00  0.00 ]
+Key: PhyReg_VR512:  [ 0.00  0.00 ]
+Key: PhyReg_VR512_0_15:  [ 0.00  0.00 ]
+Key: PhyReg_TILE:  [ 0.00  0.00 ]
+Key: PhyReg_TILEPAIR:  [ 0.00  0.00 ]
+Key: VirtReg_GR8:  [ 0.00  0.00 ]
+Key: VirtReg_GRH8:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_ABCD_H:  [ 0.00  0.00 ]
+Key: VirtReg_GR8_ABCD_L:  [ 0.00  0.00 ]
+Key: VirtReg_GRH16:  [ 0.00  0.00 ]
+Key: VirtReg_GR16:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK1:  [ 0.00  0.00 ]
+Key: VirtReg_VK16:  [ 0.00  0.00 ]
+Key: VirtReg_VK2:  [ 0.00  0.00 ]
+Key: VirtReg_VK4:  [ 0.00  0.00 ]
+Key: VirtReg_VK8:  [ 0.00  0.00 ]
+Key: VirtReg_VK16WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK1WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK2WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK4WM:  [ 0.00  0.00 ]
+Key: VirtReg_VK8WM:  [ 0.00  0.00 ]
+Key: VirtReg_SEGMENT_REG:  [ 0.00  0.00 ]
+Key: VirtReg_GR16_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_FPCCR:  [ 0.00  0.00 ]
+Key: VirtReg_FR16X:  [ 0.00  0.00 ]
+Key: VirtReg_FR16:  [ 0.00  0.00 ]
+Key: VirtReg_VK16PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK1PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK2PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK4PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK8PAIR:  [ 0.00  0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_FR32X:  [ 0.00  0.00 ]
+Key: VirtReg_GR32:  [ 0.80  0.80 ]
+Key: VirtReg_GR32_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_DEBUG_REG:  [ 0.00  0.00 ]
+Key: VirtReg_FR32:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK32:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_RFP32:  [ 0.00  0.00 ]
+Key: VirtReg_VK32WM:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_AD:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_CCR:  [ 0.00  0.00 ]
+Key: VirtReg_DFCCR:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit:  [ 0.00  0.00 ]
+Key: VirtReg_RFP64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64:  [ 0.90  0.90 ]
+Key: VirtReg_FR64X:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_CONTROL_REG:  [ 0.00  0.00 ]
+Key: VirtReg_FR64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_VK64:  [ 0.00  0.00 ]
+Key: VirtReg_VR64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit:  [ 0.00  0.00 ]
+Key: VirtReg_VK64WM:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX:  [ 0.00  0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ABCD:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_AD:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_A:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI:  [ 0.00  0.00 ]
+Key: VirtReg_RST:  [ 0.00  0.00 ]
+Key: VirtReg_RFP80:  [ 0.00  0.00 ]
+Key: VirtReg_RFP80_7:  [ 0.00  0.00 ]
+Key: VirtReg_VR128X:  [ 0.00  0.00 ]
+Key: VirtReg_VR128:  [ 0.00  0.00 ]
+Key: VirtReg_VR256X:  [ 0.00  0.00 ]
+Key: VirtReg_VR256:  [ 0.00  0.00 ]
+Key: VirtReg_VR512:  [ 0.00  0.00 ]
+Key: VirtReg_VR512_0_15:  [ 0.00  0.00 ]
+Key: VirtReg_TILE:  [ 0.00  0.00 ]
+Key: VirtReg_TILEPAIR:  [ 0.00  0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir
index 5734a23..f2572f5 100644
--- a/llvm/test/CodeGen/MIR2Vec/if-else.mir
+++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir
@@ -135,10 +135,10 @@ body:             |
 
 # CHECK: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: abc:entry:
-# CHECK-NEXT:  [ 16.50  17.10  17.70 ]
+# CHECK-NEXT:  [ 23.60  24.20  24.80 ]
 # CHECK-NEXT: Machine basic block: abc:if.then:
-# CHECK-NEXT:  [ 4.50  4.80  5.10 ]
+# CHECK-NEXT:  [ 7.30  7.60  7.90 ]
 # CHECK-NEXT: Machine basic block: abc:if.else:
-# CHECK-NEXT:  [ 0.80  1.00  1.20 ]
+# CHECK-NEXT:  [ 3.40  3.60  3.80 ]
 # CHECK-NEXT: Machine basic block: abc:return:
-# CHECK-NEXT:  [ 6.60  6.90  7.20 ]
-\ No newline at end of file
+# CHECK-NEXT:  [ 8.80  9.10  9.40 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
index 338cb63..0fdcc81 100644
--- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
+++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
@@ -48,29 +48,29 @@ body:             |
     RET 0
 
 # CHECK: MIR2Vec embeddings for machine function add_function:
-# CHECK: Function vector:  [ 19.20  19.80  20.40 ]
+# CHECK: Function vector:  [ 26.50  27.10  27.70 ]
 # CHECK-NEXT: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: add_function:entry:
-# CHECK-NEXT:  [ 19.20  19.80  20.40 ]
+# CHECK-NEXT:  [ 26.50  27.10  27.70 ]
 # CHECK-NEXT: Machine instruction vectors:
 # CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi
-# CHECK-NEXT:  [ 5.20  5.30  5.40 ]
+# CHECK-NEXT:  [ 6.00  6.10  6.20 ]
 # CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi
-# CHECK-NEXT:  [ 5.20  5.30  5.40 ]
+# CHECK-NEXT:  [ 6.00  6.10  6.20 ]
 # CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
-# CHECK-NEXT:  [ 1.30  1.40  1.50 ]
+# CHECK-NEXT:  [ 3.70  3.80  3.90 ]
 # CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
-# CHECK-NEXT:  [ 1.30  1.40  1.50 ]
+# CHECK-NEXT:  [ 3.70  3.80  3.90 ]
 # CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32
-# CHECK-NEXT:  [ 5.20  5.30  5.40 ]
+# CHECK-NEXT:  [ 6.00  6.10  6.20 ]
 # CHECK-NEXT: Machine instruction: RET 0, $eax
-# CHECK-NEXT:  [ 1.00  1.10  1.20 ]
+# CHECK-NEXT:  [ 1.10  1.20  1.30 ]
 
 # CHECK: MIR2Vec embeddings for machine function simple_function:
-# CHECK-NEXT:Function vector:  [ 1.00  1.10  1.20 ]
+# CHECK-NEXT: Function vector:  [ 1.10  1.20  1.30 ]
 # CHECK-NEXT: Machine basic block vectors:
 # CHECK-NEXT: Machine basic block: simple_function:entry:
-# CHECK-NEXT:  [ 1.00  1.10  1.20 ]
+# CHECK-NEXT:  [ 1.10  1.20  1.30 ]
 # CHECK-NEXT: Machine instruction vectors:
 # CHECK-NEXT: Machine instruction: RET 0
-# CHECK-NEXT:  [ 1.00  1.10  1.20 ]
-\ No newline at end of file
+# CHECK-NEXT:  [ 1.10  1.20  1.30 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
index c6554bc..13e908e 100644
--- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
+++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
@@ -10,6 +10,6 @@ define dso_local void @test() {
 }
 
 ; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
-; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero
-; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file
-; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension
+; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero
+; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file
+; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension
diff --git a/llvm/test/CodeGen/NVPTX/fma-assoc.ll b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
index 6693c90..db0eae7 100644
--- a/llvm/test/CodeGen/NVPTX/fma-assoc.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-assoc.ll
@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | %ptxas-verify %}
 
 define ptx_device float @t1_f32(float %x, float %y, float %z,
 ; CHECK-UNSAFE-LABEL: t1_f32(
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index cad684e..baa127e 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -2,8 +2,8 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1                        | FileCheck %s --check-prefix=FMFDEBUG
 ; RUN: llc < %s -mtriple=powerpc64le                                                           | FileCheck %s --check-prefix=FMF
-; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
-; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
+; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
+; RUN: llc < %s -mtriple=powerpc64le -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
 
 ; Test FP transforms using instruction/node-level fast-math-flags.
 ; We're also checking debug output to verify that FMF is propagated to the newly created nodes.
diff --git a/llvm/test/CodeGen/PowerPC/scalar-equal.ll b/llvm/test/CodeGen/PowerPC/scalar-equal.ll
index 1832475..c0b11b4 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-equal.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-equal.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \
 ; RUN:   -verify-machineinstrs --enable-no-signed-zeros-fp-math \
 ; RUN:   --enable-no-nans-fp-math --enable-no-infs-fp-math \
 ; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
 ; RUN:   --check-prefix=FAST-P8
-; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \
+; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \
 ; RUN:   -verify-machineinstrs --enable-no-signed-zeros-fp-math \
 ; RUN:   --enable-no-nans-fp-math --enable-no-infs-fp-math \
 ; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
diff --git a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll
index ca9baceb..5915bd3 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-min-max-p10.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names --enable-unsafe-fp-math \
+; RUN: llc -mcpu=pwr10 -ppc-asm-full-reg-names \
 ; RUN:   -verify-machineinstrs --enable-no-signed-zeros-fp-math \
 ; RUN:   --enable-no-nans-fp-math \
 ; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
index fd0b494..881d1f4 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names \
 ; RUN:   -verify-machineinstrs --enable-no-signed-zeros-fp-math \
 ; RUN:   --enable-no-nans-fp-math --enable-no-infs-fp-math \
 ; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
 ; RUN:   --check-prefix=FAST-P8
-; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \
+; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names \
 ; RUN:   -verify-machineinstrs --enable-no-signed-zeros-fp-math \
 ; RUN:   --enable-no-nans-fp-math --enable-no-infs-fp-math \
 ; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
index 4d32e66..6d41875 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
-; Test that uses of cbuffer members inside ConstantExprs are handled correctly.
+; Test that uses of cbuffer members are handled correctly.
 
 ; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0
 ; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0
@@ -37,10 +37,8 @@ entry:
 ; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]]
 ; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]]
 ; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]]
-  %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1
-
 ; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]]
-  %load_from_gep = load float, ptr addrspace(12) %gep, align 4
+  %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4
 
 ; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]]
   %load_v = load <4 x float>, ptr addrspace(12) @v, align 16
diff --git a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
index 4a38d7a..c87f113 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -1,7 +1,6 @@
 ; Test that combined sin/cos library call is emitted when appropriate
 
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
-; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
 
 define float @f1(float %x) {
 ; CHECK-OPT-LABEL: f1:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
new file mode 100644
index 0000000..5783133
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
+
+# From bug #162644. The _wrong_ output of this test is to generate the
+# body of the tail-predicated loop like this:
+#
+#     $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+#     renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+#     $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0
+#     renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+#     $lr = MVE_LETP killed renamable $lr, %bb.1
+#
+# in which the second MVE_VORR, copying q1 into q0, is an invalid conversion of
+# the input MQPRCopy, because it won't copy the vector lanes disabled by
+# FPSCR.LTPSIZE, and those are needed in the output value of the loop.
+#
+# In the right output, that MQPRCopy is expanded into a pair of VMOVD copying
+# d2,d3 into d0,d1 respectively, which are unaffected by LTPSIZE.
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+  @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16
+
+  define <4 x float> @test_func(ptr %0, i32 %1) {
+    %3 = load <4 x float>, ptr @inactive, align 16
+    %4 = add i32 %1, 3
+    %5 = call i32 @llvm.smin.i32(i32 %1, i32 4)
+    %6 = sub i32 %4, %5
+    %7 = lshr i32 %6, 2
+    %8 = add nuw nsw i32 %7, 1
+    %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8)
+    br label %10
+
+  10:                                               ; preds = %10, %2
+    %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ]
+    %12 = phi i32 [ %1, %2 ], [ %19, %10 ]
+    %13 = phi ptr [ %0, %2 ], [ %18, %10 ]
+    %14 = phi i32 [ %9, %2 ], [ %20, %10 ]
+    %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+    %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer)
+    %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3)
+    %18 = getelementptr inbounds nuw i8, ptr %13, i32 16
+    %19 = add i32 %12, -4
+    %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1)
+    %21 = icmp ne i32 %20, 0
+    br i1 %21, label %10, label %22
+
+  22:                                               ; preds = %10
+    ret <4 x float> %17
+  }
+...
+---
+name:            test_func
+alignment:       4
+legalized:       false
+tracksRegLiveness: true
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  ; CHECK-LABEL: name: test_func
+  ; CHECK: bb.0 (%ir-block.2):
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $r0, $r1, $r7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK-NEXT:   $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive)
+  ; CHECK-NEXT:   $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0
+  ; CHECK-NEXT:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.10, align 4):
+  ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK-NEXT:   liveins: $lr, $d2, $d3, $q0, $r0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2
+  ; CHECK-NEXT:   renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4)
+  ; CHECK-NEXT:   $d0 = VMOVD $d2, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $d1 = VMOVD $d3, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0
+  ; CHECK-NEXT:   $lr = MVE_LETP killed renamable $lr, %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2 (%ir-block.22):
+  ; CHECK-NEXT:   liveins: $q0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0
+  bb.0 (%ir-block.2):
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r7, $lr
+
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg
+    tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg
+    renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive)
+    $r2 = tMOVr $r1, 14 /* CC::al */, $noreg
+    t2IT 10, 8, implicit-def $itstate
+    renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate
+    renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg
+    renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg
+    renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
+    $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg
+    $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg
+    renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0
+    renamable $lr = t2DoLoopStartTP killed renamable $r2, renamable $r1
+
+  bb.1 (%ir-block.10, align 4):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $q1, $r0, $r1
+
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg
+    $q2 = MQPRCopy killed $q0
+    MVE_VPST 8, implicit $vpr
+    renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4)
+    $q0 = MQPRCopy $q1
+    MVE_VPST 8, implicit $vpr
+    renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2 (%ir-block.22):
+    liveins: $q0
+
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0
+...
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll
index 053d6a1..d741411 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-5.ll
@@ -94,5 +94,5 @@ attributes #1 = { minsize nofree norecurse nounwind optsize }
 !llvm.module.flags = !{!0, !1, !2}
 
 !0 = !{i32 8, !"branch-target-enforcement", i32 0}
-!1 = !{i32 8, !"sign-return-address", i32 1}
+!1 = !{i32 8, !"sign-return-address", i32 2}
 !2 = !{i32 8, !"sign-return-address-all", i32 0}
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 94efe0f..104ec31 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -5,6 +5,7 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
 %struct.TwoInts = type { i32, i32 }
 %struct.ThreeInts = type { i32, i32, i32 }
 %struct.FourInts = type { i32, i32, i32, i32 }
+%struct.TwoShorts = type { i16, i16 }
 %struct.ThreeShorts = type { i16, i16, i16 }
 %struct.FourShorts = type { i16, i16, i16, i16 }
 %struct.FiveShorts = type { i16, i16, i16, i16, i16 }
@@ -12,6 +13,8 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
 %struct.ThreeBytes = type { i8, i8, i8 }
 %struct.FourBytes = type { i8, i8, i8, i8 }
 %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.TwoFloats = type { float, float }
+%struct.FourFloats = type { float, float, float, float }
 
 ; CHECK-LABEL: two_ints_same_op:
 ; CHECK: loop
@@ -1536,3 +1539,1608 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
 34:                                               ; preds = %6, %4
   ret void
 }
+
+; CHECK-LABEL: two_floats_same_op:
+; CHECK-NOT: f32x4.mul
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp21.not = icmp eq i32 %N, 0
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022
+  store float %mul, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %mul8, ptr %y10, align 4
+  %inc = add nuw i32 %i.022, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_vary_op:
+; CHECK-NOT: f32x4
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp20.not = icmp eq i32 %N, 0
+  br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021
+  store float %add, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %sub, ptr %y9, align 4
+  %inc = add nuw i32 %i.021, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp24.not = icmp eq i32 %N, 0
+  br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %mul11 = fmul float %conv7, %conv10
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %mul11, ptr %y13, align 4
+  %inc = add nuw i32 %i.025, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_vary_op:
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp23.not = icmp eq i32 %N, 0
+  br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %add = fadd float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+  store float %add, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %sub = fsub float %conv7, %conv10
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %sub, ptr %y12, align 4
+  %inc = add nuw i32 %i.024, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store64_lane
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp22.not = icmp eq i32 %N, 0
+  br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i8
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %conv9 = fptosi float %mul8 to i8
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+  store i8 %conv9, ptr %y11, align 1
+  %inc = add nuw i32 %i.023, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store64_lane
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): res[i] gets i8(a[i].f0 + b[i].f0) at byte offset 0 and i8(a[i].f1 - b[i].f1) at byte offset 1, where f0/f1 are the floats at byte offsets 0 and 4.
+  %cmp21.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022  ; &a[i]; %struct.TwoFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; first field of a[i] (offset 0)
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; first field of b[i] (offset 0)
+  %add = fadd float %0, %1  ; first field uses fadd
+  %conv = fptosi float %add to i8  ; signed float -> i8 conversion
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022  ; &res[i]; %struct.TwoBytes is declared outside this excerpt
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; second field of a[i]: byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4  ; second field of b[i]: byte offset 4
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3  ; second field uses fsub, unlike the first
+  %conv8 = fptosi float %sub to i8
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1  ; second field of res[i]: byte offset 1
+  store i8 %conv8, ptr %y10, align 1
+  %inc = add nuw i32 %i.022, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): convert the i16 values at byte offsets 0 and 2 of a[i] and b[i] to float, multiply pairwise, store the products at byte offsets 0 and 4 of res[i].
+  %cmp24.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025  ; &a[i]; %struct.TwoShorts is declared outside this excerpt
+  %0 = load i16, ptr %arrayidx, align 2  ; first field of a[i] (offset 0)
+  %conv = sitofp i16 %0 to float  ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025  ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2  ; first field of b[i] (offset 0)
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025  ; &res[i]; %struct.TwoFloats is declared outside this excerpt
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2  ; second field of a[i]: byte offset 2
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2  ; second field of b[i]: byte offset 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %mul11 = fmul float %conv7, %conv10  ; same operation (fmul) as the first field
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4  ; second field of res[i]: byte offset 4
+  store float %mul11, ptr %y13, align 4
+  %inc = add nuw i32 %i.025, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): convert the i16 values at byte offsets 0 and 2 of a[i] and b[i] to float; store their sum (first field) and difference (second field) at byte offsets 0 and 4 of res[i].
+  %cmp23.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024  ; &a[i]; %struct.TwoShorts is declared outside this excerpt
+  %0 = load i16, ptr %arrayidx, align 2  ; first field of a[i] (offset 0)
+  %conv = sitofp i16 %0 to float  ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024  ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2  ; first field of b[i] (offset 0)
+  %conv3 = sitofp i16 %1 to float
+  %add = fadd float %conv, %conv3  ; first field uses fadd
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024  ; &res[i]; %struct.TwoFloats is declared outside this excerpt
+  store float %add, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2  ; second field of a[i]: byte offset 2
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2  ; second field of b[i]: byte offset 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %sub = fsub float %conv7, %conv10  ; second field uses fsub, unlike the first
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4  ; second field of res[i]: byte offset 4
+  store float %sub, ptr %y12, align 4
+  %inc = add nuw i32 %i.024, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): multiply the floats at byte offsets 0 and 4 of a[i] and b[i], convert each product to i16, store at byte offsets 0 and 2 of res[i].
+  %cmp22.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023  ; &a[i]; %struct.TwoFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; first field of a[i] (offset 0)
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; first field of b[i] (offset 0)
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i16  ; signed float -> i16 conversion
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023  ; &res[i]; %struct.TwoShorts is declared outside this excerpt
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; second field of a[i]: byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4  ; second field of b[i]: byte offset 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3  ; same operation (fmul) as the first field
+  %conv9 = fptosi float %mul8 to i16
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2  ; second field of res[i]: byte offset 2
+  store i16 %conv9, ptr %y11, align 2
+  %inc = add nuw i32 %i.023, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): res[i] gets i16(a[i].f0 + b[i].f0) at byte offset 0 and i16(a[i].f1 - b[i].f1) at byte offset 2, where f0/f1 are the floats at byte offsets 0 and 4.
+  %cmp21.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022  ; &a[i]; %struct.TwoFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; first field of a[i] (offset 0)
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; first field of b[i] (offset 0)
+  %add = fadd float %0, %1  ; first field uses fadd
+  %conv = fptosi float %add to i16  ; signed float -> i16 conversion
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022  ; &res[i]; %struct.TwoShorts is declared outside this excerpt
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; second field of a[i]: byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4  ; second field of b[i]: byte offset 4
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3  ; second field uses fsub, unlike the first
+  %conv8 = fptosi float %sub to i16
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2  ; second field of res[i]: byte offset 2
+  store i16 %conv8, ptr %y10, align 2
+  %inc = add nuw i32 %i.022, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: f32x4.mul
+; CHECK: v128.store
+define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): multiply the four floats at byte offsets 0, 4, 8 and 12 of a[i] and b[i] pairwise and store each product at the same offset in res[i].
+  %cmp45.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046  ; &a[i]; %struct.FourFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; field at offset 0 of a[i]
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; field at offset 0 of b[i]
+  %mul = fmul float %0, %1
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046  ; &res[i]
+  store float %mul, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; field at byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %mul8, ptr %y10, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8  ; field at byte offset 8
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %mul14 = fmul float %4, %5
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+  store float %mul14, ptr %z16, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12  ; field at byte offset 12
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %mul20 = fmul float %6, %7
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+  store float %mul20, ptr %w22, align 4
+  %inc = add nuw i32 %i.046, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_vary_op:
+; CHECK-NOT: f32x4
+define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): combine the four floats of a[i] and b[i] with a different operation per field (fadd, fsub, fmul, fdiv at byte offsets 0, 4, 8, 12) and store to res[i]. The FileCheck directive preceding this function forbids any f32x4 instruction in the output.
+  %cmp42.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp42.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043  ; &a[i]; %struct.FourFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; field at offset 0 of a[i]
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; field at offset 0 of b[i]
+  %add = fadd float %0, %1  ; offset 0: fadd
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043  ; &res[i]
+  store float %add, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; field at byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3  ; offset 4: fsub
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %sub, ptr %y9, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8  ; field at byte offset 8
+  %4 = load float, ptr %z, align 4
+  %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z12, align 4
+  %mul = fmul float %4, %5  ; offset 8: fmul
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+  store float %mul, ptr %z14, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12  ; field at byte offset 12
+  %6 = load float, ptr %w, align 4
+  %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w17, align 4
+  %div = fdiv float %6, %7  ; offset 12: fdiv
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+  store float %div, ptr %w19, align 4
+  %inc = add nuw i32 %i.043, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): convert the i8 values at byte offsets 0..3 of a[i] and b[i] to float, multiply pairwise, store the four products at byte offsets 0, 4, 8 and 12 of res[i].
+  %cmp52.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053  ; &a[i]; %struct.FourBytes is declared outside this excerpt
+  %0 = load i8, ptr %arrayidx, align 1  ; byte at offset 0 of a[i]
+  %conv = sitofp i8 %0 to float  ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053  ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1  ; byte at offset 0 of b[i]
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053  ; &res[i]; %struct.FourFloats is declared outside this excerpt
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1  ; input byte offset 1
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %mul11 = fmul float %conv7, %conv10
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4  ; result byte offset 4
+  store float %mul11, ptr %y13, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2  ; input byte offset 2
+  %4 = load i8, ptr %z, align 1
+  %conv15 = sitofp i8 %4 to float
+  %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %5 = load i8, ptr %z17, align 1
+  %conv18 = sitofp i8 %5 to float
+  %mul19 = fmul float %conv15, %conv18
+  %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8  ; result byte offset 8
+  store float %mul19, ptr %z21, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3  ; input byte offset 3
+  %6 = load i8, ptr %w, align 1
+  %conv23 = sitofp i8 %6 to float
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+  %7 = load i8, ptr %w25, align 1
+  %conv26 = sitofp i8 %7 to float
+  %mul27 = fmul float %conv23, %conv26
+  %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12  ; result byte offset 12
+  store float %mul27, ptr %w29, align 4
+  %inc = add nuw i32 %i.053, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.div
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): convert the i8 values at byte offsets 0..3 of a[i] and b[i] to float and combine them with a different operation per field (fmul, fadd, fdiv, fsub), storing the results at byte offsets 0, 4, 8 and 12 of res[i].
+  %cmp49.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050  ; &a[i]; %struct.FourBytes is declared outside this excerpt
+  %0 = load i8, ptr %arrayidx, align 1  ; byte at offset 0 of a[i]
+  %conv = sitofp i8 %0 to float  ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050  ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1  ; byte at offset 0 of b[i]
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3  ; field 0: fmul
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050  ; &res[i]; %struct.FourFloats is declared outside this excerpt
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1  ; input byte offset 1
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %add = fadd float %conv7, %conv10  ; field 1: fadd
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4  ; result byte offset 4
+  store float %add, ptr %y12, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2  ; input byte offset 2
+  %4 = load i8, ptr %z, align 1
+  %conv14 = sitofp i8 %4 to float
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %5 = load i8, ptr %z16, align 1
+  %conv17 = sitofp i8 %5 to float
+  %div = fdiv float %conv14, %conv17  ; field 2: fdiv
+  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8  ; result byte offset 8
+  store float %div, ptr %z19, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3  ; input byte offset 3
+  %6 = load i8, ptr %w, align 1
+  %conv21 = sitofp i8 %6 to float
+  %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+  %7 = load i8, ptr %w23, align 1
+  %conv24 = sitofp i8 %7 to float
+  %sub = fsub float %conv21, %conv24  ; field 3: fsub
+  %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12  ; result byte offset 12
+  store float %sub, ptr %w26, align 4
+  %inc = add nuw i32 %i.050, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:  ; For i in [0, N): multiply the four floats at byte offsets 0, 4, 8 and 12 of a[i] and b[i] pairwise, convert each product to i8, store at byte offsets 0..3 of res[i].
+  %cmp48.not = icmp eq i32 %N, 0  ; zero-trip guard: N == 0 skips the loop
+  br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index i, 0 on entry
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049  ; &a[i]; %struct.FourFloats is declared outside this excerpt
+  %0 = load float, ptr %arrayidx, align 4  ; field at offset 0 of a[i]
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049  ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4  ; field at offset 0 of b[i]
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i8  ; signed float -> i8 conversion
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049  ; &res[i]; %struct.FourBytes is declared outside this excerpt
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4  ; input byte offset 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %conv9 = fptosi float %mul8 to i8
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1  ; result byte offset 1
+  store i8 %conv9, ptr %y11, align 1
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8  ; input byte offset 8
+  %4 = load float, ptr %z, align 4
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z14, align 4
+  %mul15 = fmul float %4, %5
+  %conv16 = fptosi float %mul15 to i8
+  %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2  ; result byte offset 2
+  store i8 %conv16, ptr %z18, align 1
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12  ; input byte offset 12
+  %6 = load float, ptr %w, align 4
+  %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w21, align 4
+  %mul22 = fmul float %6, %7
+  %conv23 = fptosi float %mul22 to i8
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3  ; result byte offset 3
+  store i8 %conv23, ptr %w25, align 1
+  %inc = add nuw i32 %i.049, 1  ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.div
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry: ; for i in [0, N): combine the four floats of a[i] and b[i] with fmul, fadd, fdiv, fsub respectively, convert each to i8, store into res[i]
+  %cmp45.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i, starting at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 ; &a[i]
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; first float pair: multiply
+  %conv = fptosi float %mul to i8 ; signed float -> i8 conversion of the result
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046 ; &res[i]
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; second float: byte offset 4 in a[i] and b[i], result byte 1
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %add = fadd float %2, %3 ; second float pair: add
+  %conv8 = fptosi float %add to i8
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+  store i8 %conv8, ptr %y10, align 1
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; third float: byte offset 8 in a[i] and b[i], result byte 2
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %div = fdiv float %4, %5 ; third float pair: divide (a / b)
+  %conv14 = fptosi float %div to i8
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i8 %conv14, ptr %z16, align 1
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; fourth float: byte offset 12 in a[i] and b[i], result byte 3
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %sub = fsub float %6, %7 ; fourth float pair: subtract (a - b)
+  %conv20 = fptosi float %sub to i8
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+  store i8 %conv20, ptr %w22, align 1
+  %inc = add nuw i32 %i.046, 1 ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry: ; for i in [0, N): convert the four i16 values of a[i] and b[i] to float, multiply pairwise, store the four float products into res[i]
+  %cmp52.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i, starting at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053 ; &a[i]
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053 ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 ; &res[i]
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; second i16: byte offset 2 in a[i] and b[i], result at byte offset 4
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %mul11 = fmul float %conv7, %conv10
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %mul11, ptr %y13, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; third i16: byte offset 4 in a[i] and b[i], result at byte offset 8
+  %4 = load i16, ptr %z, align 2
+  %conv15 = sitofp i16 %4 to float
+  %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %5 = load i16, ptr %z17, align 2
+  %conv18 = sitofp i16 %5 to float
+  %mul19 = fmul float %conv15, %conv18
+  %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+  store float %mul19, ptr %z21, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 ; fourth i16: byte offset 6 in a[i] and b[i], result at byte offset 12
+  %6 = load i16, ptr %w, align 2
+  %conv23 = sitofp i16 %6 to float
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+  %7 = load i16, ptr %w25, align 2
+  %conv26 = sitofp i16 %7 to float
+  %mul27 = fmul float %conv23, %conv26
+  %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+  store float %mul27, ptr %w29, align 4
+  %inc = add nuw i32 %i.053, 1 ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.div
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry: ; for i in [0, N): convert the four i16 values of a[i] and b[i] to float, combine with fmul, fadd, fdiv, fsub respectively, store into res[i]
+  %cmp49.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i, starting at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050 ; &a[i]
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050 ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3 ; first pair: multiply
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 ; &res[i]
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; second i16: byte offset 2 in a[i] and b[i], result at byte offset 4
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %add = fadd float %conv7, %conv10 ; second pair: add
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %add, ptr %y12, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; third i16: byte offset 4 in a[i] and b[i], result at byte offset 8
+  %4 = load i16, ptr %z, align 2
+  %conv14 = sitofp i16 %4 to float
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %5 = load i16, ptr %z16, align 2
+  %conv17 = sitofp i16 %5 to float
+  %div = fdiv float %conv14, %conv17 ; third pair: divide (a / b)
+  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+  store float %div, ptr %z19, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6 ; fourth i16: byte offset 6 in a[i] and b[i], result at byte offset 12
+  %6 = load i16, ptr %w, align 2
+  %conv21 = sitofp i16 %6 to float
+  %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+  %7 = load i16, ptr %w23, align 2
+  %conv24 = sitofp i16 %7 to float
+  %sub = fsub float %conv21, %conv24 ; fourth pair: subtract (a - b)
+  %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+  store float %sub, ptr %w26, align 4
+  %inc = add nuw i32 %i.050, 1 ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry: ; for i in [0, N): four floats of a[i] times four floats of b[i], each product converted to i16 and stored into res[i]
+  %cmp48.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i, starting at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 ; &a[i]
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i16 ; signed float -> i16 conversion of the product
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049 ; &res[i]
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; second float: byte offset 4 in a[i] and b[i], result at byte offset 2
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %conv9 = fptosi float %mul8 to i16
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i16 %conv9, ptr %y11, align 2
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; third float: byte offset 8 in a[i] and b[i], result at byte offset 4
+  %4 = load float, ptr %z, align 4
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z14, align 4
+  %mul15 = fmul float %4, %5
+  %conv16 = fptosi float %mul15 to i16
+  %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store i16 %conv16, ptr %z18, align 2
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; fourth float: byte offset 12 in a[i] and b[i], result at byte offset 6
+  %6 = load float, ptr %w, align 4
+  %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w21, align 4
+  %mul22 = fmul float %6, %7
+  %conv23 = fptosi float %mul22 to i16
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+  store i16 %conv23, ptr %w25, align 2
+  %inc = add nuw i32 %i.049, 1 ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.div
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry: ; for i in [0, N): combine the four floats of a[i] and b[i] with fmul, fadd, fdiv, fsub respectively, convert each to i16, store into res[i]
+  %cmp45.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i, starting at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 ; &a[i]
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; first float pair: multiply
+  %conv = fptosi float %mul to i16 ; signed float -> i16 conversion of the result
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046 ; &res[i]
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; second float: byte offset 4 in a[i] and b[i], result at byte offset 2
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %add = fadd float %2, %3 ; second float pair: add
+  %conv8 = fptosi float %add to i16
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i16 %conv8, ptr %y10, align 2
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; third float: byte offset 8 in a[i] and b[i], result at byte offset 4
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %div = fdiv float %4, %5 ; third float pair: divide (a / b)
+  %conv14 = fptosi float %div to i16
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store i16 %conv14, ptr %z16, align 2
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; fourth float: byte offset 12 in a[i] and b[i], result at byte offset 6
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %sub = fsub float %6, %7 ; fourth float pair: subtract (a - b)
+  %conv20 = fptosi float %sub to i16
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+  store i16 %conv20, ptr %w22, align 2
+  %inc = add nuw i32 %i.046, 1 ; i + 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
new file mode 100644
index 0000000..45f4ddd
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s
+
+; Test that fmaxnum and fmaximumnum get transformed to relaxed_max
+
+target triple = "wasm32"
+
+define <4 x float> @test_maxnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_maxnum_f32x4:
+; CHECK:         .functype test_maxnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.relaxed_max
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) ; maxnum on <4 x float>: expected to select one f32x4.relaxed_max
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_maximumnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_maximumnum_f32x4:
+; CHECK:         .functype test_maximumnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.relaxed_max
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) ; maximumnum on <4 x float>: expected to select one f32x4.relaxed_max
+  ret <4 x float> %result
+}
+
+define <2 x double> @test_maxnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_maxnum_f64x2:
+; CHECK:         .functype test_maxnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.relaxed_max
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) ; maxnum on <2 x double>: expected to select one f64x2.relaxed_max
+  ret <2 x double> %result
+}
+
+define <2 x double> @test_maximumnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_maximumnum_f64x2:
+; CHECK:         .functype test_maximumnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.relaxed_max
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) ; maximumnum on <2 x double>: expected to select one f64x2.relaxed_max
+  ret <2 x double> %result
+}
+
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.maximumnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
new file mode 100644
index 0000000..f3eec02
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s
+
+; Test that fminnum and fminimumnum get transformed to relaxed_min
+
+target triple = "wasm32"
+
+define <4 x float> @test_minnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_minnum_f32x4:
+; CHECK:         .functype test_minnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.relaxed_min
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_minimumnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_minimumnum_f32x4:
+; CHECK:         .functype test_minimumnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f32x4.relaxed_min
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+define <2 x double> @test_minnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_minnum_f64x2:
+; CHECK:         .functype test_minnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.relaxed_min
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %result
+}
+
+define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_minimumnum_f64x2:
+; CHECK:         .functype test_minimumnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    f64x2.relaxed_min
+; CHECK-NEXT:    # fallthrough-return
+  %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %result
+}
+
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.minimumnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
index 123438d..f58456b 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
@@ -94,6 +94,19 @@ entry:
   ret <16 x i8> %0
 }
 
+define <8 x i8>  @trunc8i16_8i8(<8 x i16> %a) {
+; CHECK-LABEL: trunc8i16_8i8:
+; CHECK:         .functype trunc8i16_8i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  %0 = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %0
+}
+
 define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; CHECK-LABEL: trunc8i64_8i16:
 ; CHECK:         .functype trunc8i64_8i16 (v128, v128, v128, v128) -> (v128)
@@ -139,3 +152,29 @@ entry:
   %0 = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %0
 }
+
+define <4 x i16> @trunc4i32_4i16(<4 x i32> %a) {
+; CHECK-LABEL: trunc4i32_4i16:
+; CHECK:         .functype trunc4i32_4i16 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  %0 = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %0
+}
+
+define <4 x i8> @trunc4i32_4i8(<4 x i32> %a) {
+; CHECK-LABEL: trunc4i32_4i8:
+; CHECK:         .functype trunc4i32_4i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    # fallthrough-return
+entry:
+  %0 = trunc <4 x i32> %a to <4 x i8>
+  ret <4 x i8> %0
+}
diff --git a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
index bea11e9..940fe8c 100644
--- a/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
+++ b/llvm/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=i686-- -mattr=-sse | FileCheck %s -check-prefix=WITHNANS
-; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS
 
 ; WITHNANS-LABEL: test:
 ; WITHNANS: setnp
diff --git a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
index 8411a40..ff7a99a 100644
--- a/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
+++ b/llvm/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -enable-unsafe-fp-math -mtriple=i686-- | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
 ; rdar://5902801
 
 declare void @test2()
diff --git a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
index 6ebbb2e..0e0e20f 100644
--- a/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
+++ b/llvm/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-unsafe-fp-math
+; RUN: llc < %s
 ; <rdar://problem/12180135>
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.8.0"
diff --git a/llvm/test/CodeGen/X86/avx-minmax.ll b/llvm/test/CodeGen/X86/avx-minmax.ll
index 6da04c5..8e4b6c6 100644
--- a/llvm/test/CodeGen/X86/avx-minmax.ll
+++ b/llvm/test/CodeGen/X86/avx-minmax.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -enable-no-nans-fp-math | FileCheck %s
 
 define <2 x double> @maxpd(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: maxpd:
diff --git a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll
index f827998..eb9de8a 100644
--- a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll
+++ b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE
 ; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -mattr=+avx512f | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64 -enable-no-signed-zeros-fp-math -mattr=+avx512f | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s
 
 define <16 x float> @test_max_v16f32(ptr %a_ptr, <16 x float> %b)  {
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
index 5d9784a..1147d79 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
 
 define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) {
 ; CHECK-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
index b58bae9..1c4d9c6 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
 
 define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
 ; CHECK-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
index 92bdebb..a8ff969 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s
 
 define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
 ; CHECK-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/bf16-fast-isel.ll b/llvm/test/CodeGen/X86/bf16-fast-isel.ll
new file mode 100644
index 0000000..c659e0e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bf16-fast-isel.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i8 @test_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_direct_call:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq foo@PLT
+; CHECK-NEXT:    callq bar@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
+entry:
+  %call = call bfloat @foo(ptr %f)
+  %call2 = call zeroext i8 @bar(bfloat %call)
+  ret i8 %call2
+}
+
+define i8 @test_fast_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_fast_direct_call:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq foo_fast@PLT
+; CHECK-NEXT:    callq bar@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
+entry:
+  %call = call fastcc bfloat @foo_fast(ptr %f)
+  %call2 = call zeroext i8 @bar(bfloat %call)
+  ret i8 %call2
+}
+
+define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_indirect_all:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movq %rsi, %rdi
+; CHECK-NEXT:    callq foo@PLT
+; CHECK-NEXT:    callq *%rbx
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+entry:
+  %call = call bfloat @foo(ptr %f)
+  %call2 = call zeroext i8 %fptr(bfloat %call)
+  ret i8 %call2
+}
+
+define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_fast_indirect_all:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movq %rsi, %rdi
+; CHECK-NEXT:    callq foo@PLT
+; CHECK-NEXT:    callq *%rbx
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+entry:
+  %call = call fastcc bfloat @foo(ptr %f)
+  %call2 = call zeroext i8 %fptr(bfloat %call)
+  ret i8 %call2
+}
+
+declare bfloat @foo(ptr %f)
+declare zeroext i8 @bar(bfloat)
+declare fastcc bfloat @foo_fast(ptr %f)
diff --git a/llvm/test/CodeGen/X86/dag-fmf-cse.ll b/llvm/test/CodeGen/X86/dag-fmf-cse.ll
index 609ccdc..cdcc082 100644
--- a/llvm/test/CodeGen/X86/dag-fmf-cse.ll
+++ b/llvm/test/CodeGen/X86/dag-fmf-cse.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma | FileCheck %s
 
 ; If fast-math-flags are propagated correctly, the mul1 expression
 ; should be recognized as a factor in the last fsub, so we should
diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll
index 82c82ac..4e6da83 100644
--- a/llvm/test/CodeGen/X86/fabs.ll
+++ b/llvm/test/CodeGen/X86/fabs.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3                                                | FileCheck %s --check-prefix=X87
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse,-sse2,-sse3 -enable-no-nans-fp-math | FileCheck %s --check-prefix=X87UNSAFE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown                                                                      | FileCheck %s --check-prefix=X64
 
 declare float @fabsf(float)
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 0fe107c..aae6cda 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -22,25 +22,24 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
 define float @test_fmaximumnum(float %x, float %y) nounwind {
 ; SSE2-LABEL: test_fmaximumnum:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    js .LBB0_2
-; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    js .LBB0_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    jmp .LBB0_3
+; SSE2-NEXT:  .LBB0_1:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:  .LBB0_3:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:  .LBB0_2:
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    cmpordss %xmm3, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    andps %xmm3, %xmm4
-; SSE2-NEXT:    js .LBB0_4
-; SSE2-NEXT:  # %bb.3:
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:  .LBB0_4:
-; SSE2-NEXT:    maxss %xmm1, %xmm3
-; SSE2-NEXT:    andnps %xmm3, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    maxss %xmm2, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    andnps %xmm3, %xmm2
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximumnum:
@@ -56,7 +55,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
 ; AVX1-NEXT:  .LBB0_3:
 ; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -70,7 +69,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
@@ -95,7 +94,7 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
 ; X86-NEXT:  .LBB0_3:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -371,26 +370,25 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"
 ; SSE2-LABEL: test_fmaximumnum_nsz:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    cmpordss %xmm0, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm3
-; SSE2-NEXT:    andps %xmm0, %xmm3
-; SSE2-NEXT:    maxss %xmm1, %xmm0
-; SSE2-NEXT:    andnps %xmm0, %xmm2
-; SSE2-NEXT:    orps %xmm3, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    maxss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm1
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximumnum_nsz:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fmaximumnum_nsz:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -404,9 +402,9 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
-; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
-; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -421,23 +419,22 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
 ; SSE2-NEXT:    divss %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    js .LBB9_2
-; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:  .LBB9_2:
-; SSE2-NEXT:    movaps %xmm3, %xmm2
-; SSE2-NEXT:    cmpordss %xmm3, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm4
-; SSE2-NEXT:    andps %xmm3, %xmm4
-; SSE2-NEXT:    js .LBB9_4
-; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    js .LBB9_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    jmp .LBB9_3
+; SSE2-NEXT:  .LBB9_1:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:  .LBB9_4:
-; SSE2-NEXT:    maxss %xmm1, %xmm3
+; SSE2-NEXT:  .LBB9_3:
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    maxss %xmm2, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm2
 ; SSE2-NEXT:    andnps %xmm3, %xmm2
-; SSE2-NEXT:    orps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximumnum_combine_cmps:
@@ -454,7 +451,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
 ; AVX1-NEXT:    vmovaps %xmm0, %xmm1
 ; AVX1-NEXT:  .LBB9_3:
 ; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -469,7 +466,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512F-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512F-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
@@ -507,7 +504,7 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
 ; X86-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-NEXT:  .LBB9_3:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -527,23 +524,23 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    js .LBB10_2
-; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:  .LBB10_2:
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    cmpordss %xmm3, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm4
-; SSE2-NEXT:    andps %xmm3, %xmm4
-; SSE2-NEXT:    js .LBB10_4
-; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    js .LBB10_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    jmp .LBB10_3
+; SSE2-NEXT:  .LBB10_1:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:  .LBB10_4:
-; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:  .LBB10_3:
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    minss %xmm2, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm2
 ; SSE2-NEXT:    andnps %xmm3, %xmm2
-; SSE2-NEXT:    orps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimumnum:
@@ -559,7 +556,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
 ; AVX1-NEXT:  .LBB10_3:
 ; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -573,7 +570,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -599,7 +596,7 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB10_3:
 ; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -857,26 +854,25 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
 ; SSE2-LABEL: test_fminimumnum_nsz:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    cmpordss %xmm0, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm3
-; SSE2-NEXT:    andps %xmm0, %xmm3
-; SSE2-NEXT:    minss %xmm1, %xmm0
-; SSE2-NEXT:    andnps %xmm0, %xmm2
-; SSE2-NEXT:    orps %xmm3, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    minss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm1
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimumnum_nsz:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: test_fminimumnum_nsz:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -890,9 +886,9 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
-; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
-; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -907,23 +903,23 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; SSE2-NEXT:    divss %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    js .LBB19_2
-; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:  .LBB19_2:
-; SSE2-NEXT:    movaps %xmm3, %xmm2
-; SSE2-NEXT:    cmpordss %xmm3, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm4
-; SSE2-NEXT:    andps %xmm3, %xmm4
-; SSE2-NEXT:    js .LBB19_4
-; SSE2-NEXT:  # %bb.3:
+; SSE2-NEXT:    js .LBB19_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    jmp .LBB19_3
+; SSE2-NEXT:  .LBB19_1:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:  .LBB19_4:
-; SSE2-NEXT:    minss %xmm0, %xmm3
+; SSE2-NEXT:  .LBB19_3:
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    minss %xmm2, %xmm3
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    cmpunordss %xmm3, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm2
 ; SSE2-NEXT:    andnps %xmm3, %xmm2
-; SSE2-NEXT:    orps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fminimumnum_combine_cmps:
@@ -940,7 +936,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; AVX1-NEXT:    vmovaps %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB19_3:
 ; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -955,7 +951,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
 ; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
-; AVX512F-NEXT:    vcmpordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
 ; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
@@ -994,7 +990,7 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
 ; X86-NEXT:    vmovaps %xmm2, %xmm0
 ; X86-NEXT:  .LBB19_3:
 ; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -1022,9 +1018,9 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
 ; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    minpd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
-; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
 ; SSE2-NEXT:    andnpd %xmm1, %xmm0
 ; SSE2-NEXT:    orpd %xmm3, %xmm0
 ; SSE2-NEXT:    retq
@@ -1034,7 +1030,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vcmpunordpd %xmm1, %xmm1, %xmm2
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -1048,7 +1044,7 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
 ; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordpd %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
@@ -1084,19 +1080,17 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorpd %xmm1, %xmm1
 ; SSE2-NEXT:    minpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm0, %xmm2
-; SSE2-NEXT:    cmpordpd %xmm0, %xmm2
-; SSE2-NEXT:    andpd %xmm2, %xmm0
-; SSE2-NEXT:    andnpd %xmm1, %xmm2
-; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: test_fminimumnum_vector_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vandnpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_zero:
@@ -1108,9 +1102,9 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
 ; X86-LABEL: test_fminimumnum_vector_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vandnpd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
   ret <2 x double> %r
@@ -1120,20 +1114,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
 ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; SSE2-NEXT:    maxps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    cmpordps %xmm0, %xmm2
-; SSE2-NEXT:    andps %xmm2, %xmm0
-; SSE2-NEXT:    andnps %xmm1, %xmm2
-; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    maxps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: test_fmaximumnum_vector_signed_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero:
@@ -1144,9 +1139,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
 ; X86-LABEL: test_fmaximumnum_vector_signed_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
-; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
   ret <4 x float> %r
@@ -1155,13 +1150,14 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
 define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
 ; SSE2-LABEL: test_fminimumnum_vector_partially_zero:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    cmpordpd %xmm0, %xmm1
-; SSE2-NEXT:    xorpd %xmm2, %xmm2
-; SSE2-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm2
 ; SSE2-NEXT:    minpd %xmm0, %xmm2
-; SSE2-NEXT:    andpd %xmm1, %xmm0
-; SSE2-NEXT:    andnpd %xmm2, %xmm1
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm2, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm1
+; SSE2-NEXT:    andnpd %xmm2, %xmm0
 ; SSE2-NEXT:    orpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1169,9 +1165,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero:
@@ -1185,9 +1181,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
   ret <2 x double> %r
@@ -1212,9 +1208,9 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
 ; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    minpd %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
-; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm0, %xmm3
 ; SSE2-NEXT:    andnpd %xmm1, %xmm0
 ; SSE2-NEXT:    orpd %xmm3, %xmm0
 ; SSE2-NEXT:    retq
@@ -1226,7 +1222,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vcmpunordpd %xmm1, %xmm1, %xmm2
 ; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -1244,7 +1240,7 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
 ; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordpd %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
@@ -1278,20 +1274,24 @@ define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) {
 define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
 ; SSE2-LABEL: test_fminimumnum_vector_nan:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorpd %xmm2, %xmm2
 ; SSE2-NEXT:    xorpd %xmm1, %xmm1
 ; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; SSE2-NEXT:    minpd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    minpd %xmm0, %xmm2
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm2, %xmm0
+; SSE2-NEXT:    andpd %xmm0, %xmm1
+; SSE2-NEXT:    andnpd %xmm2, %xmm0
+; SSE2-NEXT:    orpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: test_fminimumnum_vector_nan:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
-; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_nan:
@@ -1306,7 +1306,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vcmpordpd %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
 ; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
@@ -1318,19 +1318,17 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorpd %xmm1, %xmm1
 ; SSE2-NEXT:    minpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm0, %xmm2
-; SSE2-NEXT:    cmpordpd %xmm0, %xmm2
-; SSE2-NEXT:    andpd %xmm2, %xmm0
-; SSE2-NEXT:    andnpd %xmm1, %xmm2
-; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: test_fminimumnum_vector_zero_first:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vandnpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_zero_first:
@@ -1342,9 +1340,9 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
 ; X86-LABEL: test_fminimumnum_vector_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm1
-; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
+; X86-NEXT:    vandnpd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    retl
   %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
   ret <2 x double> %r
@@ -1378,20 +1376,21 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
 ; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; SSE2-NEXT:    maxps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    cmpordps %xmm0, %xmm2
-; SSE2-NEXT:    andps %xmm2, %xmm0
-; SSE2-NEXT:    andnps %xmm1, %xmm2
-; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    maxps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm2, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first:
@@ -1402,9 +1401,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
 ; X86-LABEL: test_fmaximumnum_vector_signed_zero_first:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
-; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
   ret <4 x float> %r
@@ -1455,11 +1454,11 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    maxps %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    cmpordps %xmm0, %xmm2
-; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    cmpunordps %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    andnps %xmm1, %xmm2
-; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximumnum_v4f32_splat:
@@ -1468,7 +1467,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordps %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -1478,7 +1477,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
 ; AVX512-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
 ; AVX512-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
-; AVX512-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; AVX512-NEXT:    vcmpunordps %xmm1, %xmm1, %xmm2
 ; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -1494,7 +1493,7 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
 ; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
 ; X86-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordps %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    retl
   %splatinsert = insertelement <4 x float> poison, float %y, i64 0
@@ -1506,134 +1505,130 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
 define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; SSE2-LABEL: test_fmaximumnum_v4f16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    subq $104, %rsp
-; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    subq $136, %rsp
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    js .LBB33_2
-; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:  .LBB33_2:
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    cmpordss %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    andps %xmm2, %xmm3
-; SSE2-NEXT:    js .LBB33_4
-; SSE2-NEXT:  # %bb.3:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:  .LBB33_4:
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    maxss %xmm4, %xmm2
-; SSE2-NEXT:    andnps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    js .LBB33_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    jmp .LBB33_3
+; SSE2-NEXT:  .LBB33_1:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB33_3:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    psrlq $48, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    maxss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    andps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    js .LBB33_6
+; SSE2-NEXT:    js .LBB33_4
 ; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:  .LBB33_6:
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    cmpordss %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    andps %xmm2, %xmm3
-; SSE2-NEXT:    js .LBB33_8
-; SSE2-NEXT:  # %bb.7:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:  .LBB33_8:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    jmp .LBB33_6
+; SSE2-NEXT:  .LBB33_4:
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psrlq $48, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psrlq $48, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    maxss %xmm4, %xmm2
-; SSE2-NEXT:    andnps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB33_6:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    maxss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    andps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    js .LBB33_10
-; SSE2-NEXT:  # %bb.9:
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:  .LBB33_10:
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    cmpordss %xmm2, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm3
-; SSE2-NEXT:    andps %xmm2, %xmm3
-; SSE2-NEXT:    js .LBB33_12
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:  .LBB33_12:
-; SSE2-NEXT:    maxss %xmm4, %xmm2
+; SSE2-NEXT:    js .LBB33_7
+; SSE2-NEXT:  # %bb.8:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    jmp .LBB33_9
+; SSE2-NEXT:  .LBB33_7:
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB33_9:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    maxss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    andnps %xmm2, %xmm1
-; SSE2-NEXT:    orps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    andps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testl %eax, %eax
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    js .LBB33_14
-; SSE2-NEXT:  # %bb.13:
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:  .LBB33_14:
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    cmpordss %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm3
-; SSE2-NEXT:    andps %xmm2, %xmm3
-; SSE2-NEXT:    js .LBB33_16
-; SSE2-NEXT:  # %bb.15:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:  .LBB33_16:
-; SSE2-NEXT:    maxss %xmm4, %xmm2
-; SSE2-NEXT:    andnps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm3, %xmm0
+; SSE2-NEXT:    js .LBB33_10
+; SSE2-NEXT:  # %bb.11:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    jmp .LBB33_12
+; SSE2-NEXT:  .LBB33_10:
+; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:  .LBB33_12:
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    maxss %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    andnps %xmm2, %xmm1
+; SSE2-NEXT:    andps %xmm3, %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
 ; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1641,7 +1636,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    addq $104, %rsp
+; SSE2-NEXT:    addq $136, %rsp
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: test_fmaximumnum_v4f16:
@@ -1679,7 +1674,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    callq __truncsfhf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1700,7 +1695,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
 ; AVX1-NEXT:  .LBB33_6:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    callq __truncsfhf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1721,7 +1716,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
 ; AVX1-NEXT:  .LBB33_9:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    callq __truncsfhf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -1742,7 +1737,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
 ; AVX1-NEXT:  .LBB33_12:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    callq __truncsfhf2@PLT
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
@@ -1768,7 +1763,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vmovss %xmm2, %xmm3, %xmm3 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm4, %xmm3, %xmm2
-; AVX512-NEXT:    vcmpordss %xmm3, %xmm3, %k1
+; AVX512-NEXT:    vcmpunordss %xmm2, %xmm2, %k1
 ; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
 ; AVX512-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
@@ -1783,7 +1778,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT:    vcmpordss %xmm4, %xmm4, %k1
+; AVX512-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
 ; AVX512-NEXT:    vmovss %xmm4, %xmm3, %xmm3 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -1799,7 +1794,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT:    vcmpordss %xmm4, %xmm4, %k1
+; AVX512-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
 ; AVX512-NEXT:    vmovss %xmm4, %xmm3, %xmm3 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
 ; AVX512-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
@@ -1814,7 +1809,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm5, %xmm6, %xmm6 {%k1}
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT:    vcmpordss %xmm5, %xmm5, %k1
+; AVX512-NEXT:    vcmpunordss %xmm4, %xmm4, %k1
 ; AVX512-NEXT:    vmovss %xmm5, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -1831,7 +1826,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmovss %xmm3, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT:    vcmpordss %xmm4, %xmm4, %k1
+; AVX512-NEXT:    vcmpunordss %xmm3, %xmm3, %k1
 ; AVX512-NEXT:    vmovss %xmm4, %xmm3, %xmm3 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
@@ -1846,7 +1841,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm5, %xmm6, %xmm6 {%k1}
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT:    vcmpordss %xmm5, %xmm5, %k1
+; AVX512-NEXT:    vcmpunordss %xmm4, %xmm4, %k1
 ; AVX512-NEXT:    vmovss %xmm5, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -1860,7 +1855,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm5, %xmm6, %xmm6 {%k1}
 ; AVX512-NEXT:    vmovss %xmm4, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT:    vcmpordss %xmm5, %xmm5, %k1
+; AVX512-NEXT:    vcmpunordss %xmm4, %xmm4, %k1
 ; AVX512-NEXT:    vmovss %xmm5, %xmm4, %xmm4 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm0
@@ -1875,7 +1870,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; AVX512-NEXT:    vmovss %xmm1, %xmm5, %xmm5 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm5, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
@@ -1933,7 +1928,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB33_3:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __extendhfsf2
@@ -1955,7 +1950,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB33_6:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfhf2
@@ -1993,7 +1988,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB33_9:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __extendhfsf2
@@ -2015,7 +2010,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:  .LBB33_12:
 ; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfhf2
@@ -2041,120 +2036,114 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    pushq %rbp
 ; SSE2-NEXT:    pushq %r15
 ; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
 ; SSE2-NEXT:    subq $56, %rsp
-; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
-; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrld $16, %xmm2
-; SSE2-NEXT:    pextrw $0, %xmm2, %eax
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrld $16, %xmm2
-; SSE2-NEXT:    pextrw $0, %xmm2, %ecx
+; SSE2-NEXT:    psrlq $48, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq $48, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
+; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
+; SSE2-NEXT:    pextrw $0, %xmm4, %r15d
+; SSE2-NEXT:    pextrw $0, %xmm0, %r12d
+; SSE2-NEXT:    pextrw $0, %xmm1, %r13d
+; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
 ; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    testl %ecx, %ecx
-; SSE2-NEXT:    movdqa %xmm3, %xmm7
-; SSE2-NEXT:    js .LBB34_2
-; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:  .LBB34_2:
-; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
-; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1]
-; SSE2-NEXT:    movdqa %xmm7, %xmm0
-; SSE2-NEXT:    cmpordss %xmm7, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    andps %xmm7, %xmm4
-; SSE2-NEXT:    js .LBB34_4
-; SSE2-NEXT:  # %bb.3:
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:  .LBB34_4:
-; SSE2-NEXT:    pextrw $0, %xmm5, %ebp
-; SSE2-NEXT:    pextrw $0, %xmm6, %ebx
-; SSE2-NEXT:    maxss %xmm2, %xmm7
-; SSE2-NEXT:    andnps %xmm7, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    js .LBB34_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    jmp .LBB34_3
+; SSE2-NEXT:  .LBB34_1:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:  .LBB34_3:
+; SSE2-NEXT:    pextrw $0, %xmm2, %ebx
+; SSE2-NEXT:    pextrw $0, %xmm3, %r14d
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    maxss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
 ; SSE2-NEXT:    callq __truncsfbf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    shll $16, %r15d
-; SSE2-NEXT:    movd %r15d, %xmm3
-; SSE2-NEXT:    shll $16, %r14d
-; SSE2-NEXT:    movd %r14d, %xmm2
-; SSE2-NEXT:    testl %r15d, %r15d
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    js .LBB34_6
+; SSE2-NEXT:    shll $16, %r13d
+; SSE2-NEXT:    movd %r13d, %xmm1
+; SSE2-NEXT:    shll $16, %r12d
+; SSE2-NEXT:    movd %r12d, %xmm2
+; SSE2-NEXT:    js .LBB34_4
 ; SSE2-NEXT:  # %bb.5:
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    jmp .LBB34_6
+; SSE2-NEXT:  .LBB34_4:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:  .LBB34_6:
-; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    psrlq $48, %xmm5
-; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT:    psrlq $48, %xmm6
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    cmpordss %xmm1, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    andps %xmm1, %xmm4
-; SSE2-NEXT:    js .LBB34_8
-; SSE2-NEXT:  # %bb.7:
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:  .LBB34_8:
-; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
-; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
-; SSE2-NEXT:    maxss %xmm2, %xmm1
-; SSE2-NEXT:    andnps %xmm1, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    maxss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
 ; SSE2-NEXT:    callq __truncsfbf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    shll $16, %ebx
-; SSE2-NEXT:    movd %ebx, %xmm1
+; SSE2-NEXT:    shll $16, %r15d
+; SSE2-NEXT:    movd %r15d, %xmm1
 ; SSE2-NEXT:    shll $16, %ebp
-; SSE2-NEXT:    movd %ebp, %xmm3
-; SSE2-NEXT:    testl %ebx, %ebx
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    js .LBB34_10
-; SSE2-NEXT:  # %bb.9:
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:  .LBB34_10:
+; SSE2-NEXT:    movd %ebp, %xmm2
+; SSE2-NEXT:    js .LBB34_7
+; SSE2-NEXT:  # %bb.8:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    cmpordss %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    andps %xmm2, %xmm4
-; SSE2-NEXT:    js .LBB34_12
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:  .LBB34_12:
-; SSE2-NEXT:    maxss %xmm3, %xmm2
-; SSE2-NEXT:    andnps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    jmp .LBB34_9
+; SSE2-NEXT:  .LBB34_7:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_9:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    maxss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
 ; SSE2-NEXT:    callq __truncsfbf2@PLT
 ; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
 ; SSE2-NEXT:    shll $16, %r14d
 ; SSE2-NEXT:    movd %r14d, %xmm1
-; SSE2-NEXT:    shll $16, %r15d
-; SSE2-NEXT:    movd %r15d, %xmm3
-; SSE2-NEXT:    testl %r14d, %r14d
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    js .LBB34_14
-; SSE2-NEXT:  # %bb.13:
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:  .LBB34_14:
+; SSE2-NEXT:    shll $16, %ebx
+; SSE2-NEXT:    movd %ebx, %xmm2
+; SSE2-NEXT:    js .LBB34_10
+; SSE2-NEXT:  # %bb.11:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    cmpordss %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    andps %xmm2, %xmm4
-; SSE2-NEXT:    js .LBB34_16
-; SSE2-NEXT:  # %bb.15:
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:  .LBB34_16:
-; SSE2-NEXT:    maxss %xmm3, %xmm2
-; SSE2-NEXT:    andnps %xmm2, %xmm0
-; SSE2-NEXT:    orps %xmm4, %xmm0
+; SSE2-NEXT:    jmp .LBB34_12
+; SSE2-NEXT:  .LBB34_10:
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:  .LBB34_12:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    maxss %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andnps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps %xmm3, %xmm0
 ; SSE2-NEXT:    callq __truncsfbf2@PLT
 ; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2164,6 +2153,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    addq $56, %rsp
 ; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r13
 ; SSE2-NEXT:    popq %r14
 ; SSE2-NEXT:    popq %r15
 ; SSE2-NEXT:    popq %rbp
@@ -2205,7 +2196,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vpextrw $0, %xmm2, %ebp
 ; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    callq __truncsfbf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2222,7 +2213,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB34_6:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    callq __truncsfbf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2239,7 +2230,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB34_9:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    callq __truncsfbf2@PLT
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -2256,7 +2247,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX1-NEXT:  .LBB34_12:
 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    callq __truncsfbf2@PLT
 ; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
@@ -2305,7 +2296,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    callq __truncsfbf2@PLT
 ; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
@@ -2319,7 +2310,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    callq __truncsfbf2@PLT
 ; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
@@ -2333,7 +2324,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    callq __truncsfbf2@PLT
 ; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
@@ -2347,7 +2338,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
 ; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
 ; AVX512-NEXT:    callq __truncsfbf2@PLT
 ; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
@@ -2400,7 +2391,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vpextrw $0, %xmm2, %edi
 ; X86-NEXT:    vpextrw $0, %xmm3, %ebp
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    shll $16, %ecx
@@ -2416,7 +2407,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
 ; X86-NEXT:  .LBB34_6:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfbf2
@@ -2436,7 +2427,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
 ; X86-NEXT:  .LBB34_9:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfbf2
@@ -2456,7 +2447,7 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovdqa %xmm2, %xmm0
 ; X86-NEXT:  .LBB34_12:
 ; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfbf2
diff --git a/llvm/test/CodeGen/X86/fp-undef.ll b/llvm/test/CodeGen/X86/fp-undef.ll
index 227f007..c358085 100644
--- a/llvm/test/CodeGen/X86/fp-undef.ll
+++ b/llvm/test/CodeGen/X86/fp-undef.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown                        | FileCheck %s --check-prefix=ANY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck %s --check-prefix=ANY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ANY
 
 ; This is duplicated from tests for InstSimplify. If you're
 ; adding something here, you should probably add it there too.
diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll
index 659e4dd..27a651e 100644
--- a/llvm/test/CodeGen/X86/fp128-select.ll
+++ b/llvm/test/CodeGen/X86/fp128-select.ll
@@ -13,8 +13,8 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    testl %edx, %edx
 ; SSE-NEXT:    jne .LBB0_1
-; SSE-NEXT:  # %bb.3:
-; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:  # %bb.2:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [NaN]
 ; SSE-NEXT:    movaps %xmm0, (%rsi)
 ; SSE-NEXT:    retq
 ; SSE-NEXT:  .LBB0_1:
@@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind {
 ; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    jmp .LBB1_3
 ; SSE-NEXT:  .LBB1_1:
-; SSE-NEXT:    movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0]
 ; SSE-NEXT:  .LBB1_3: # %BB0
 ; SSE-NEXT:    testl %ebx, %ebx
 ; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/fsxor-alignment.ll b/llvm/test/CodeGen/X86/fsxor-alignment.ll
index 6fa4a31..32af5b9 100644
--- a/llvm/test/CodeGen/X86/fsxor-alignment.ll
+++ b/llvm/test/CodeGen/X86/fsxor-alignment.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
 
 ; Don't fold the incoming stack arguments into the xorps instructions used
 ; to do floating-point negations, because the arguments aren't vectors
diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
index f710a30..bd997d1 100644
--- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse < %s | FileCheck %s
 
 ; The debug info in this test case was causing a crash because machine trace metrics
 ; did not correctly ignore debug instructions. The check lines ensure that the
diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll
index 8020982..18ded50 100644
--- a/llvm/test/CodeGen/X86/neg_fp.ll
+++ b/llvm/test/CodeGen/X86/neg_fp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s
 
-; Test that when we don't -enable-unsafe-fp-math, we don't do the optimization
+; Test that when signed zeros must be respected, we don't do the optimization
 ; -0 - (A - B) to (B - A) because A==B, -0 != 0
 
 define float @negfp(float %a, float %b) nounwind {
diff --git a/llvm/test/CodeGen/X86/negate-add-zero.ll b/llvm/test/CodeGen/X86/negate-add-zero.ll
index eb4e2d3..4884832 100644
--- a/llvm/test/CodeGen/X86/negate-add-zero.ll
+++ b/llvm/test/CodeGen/X86/negate-add-zero.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s | FileCheck %s
 ; PR3374
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/llvm/test/CodeGen/X86/recip-pic.ll b/llvm/test/CodeGen/X86/recip-pic.ll
index d01ecc1..d2620e7 100644
--- a/llvm/test/CodeGen/X86/recip-pic.ll
+++ b/llvm/test/CodeGen/X86/recip-pic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu  -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu  -mcpu=slm -relocation-model=pic | FileCheck %s --check-prefix=CHECK
 
 define fastcc float @foo(float %x) unnamed_addr #0 {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/sincos-opt.ll b/llvm/test/CodeGen/X86/sincos-opt.ll
index 6885456..51f3e52 100644
--- a/llvm/test/CodeGen/X86/sincos-opt.ll
+++ b/llvm/test/CodeGen/X86/sincos-opt.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT
 ; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
 ; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS
-; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
+; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
 ; RUN: llc < %s -mtriple=x86_64-scei-ps4 -mcpu=btver2 | FileCheck %s --check-prefix=PS4_SINCOS
 ; RUN: llc < %s -mtriple=x86_64-sie-ps5  -mcpu=znver2 | FileCheck %s --check-prefix=PS4_SINCOS
 
diff --git a/llvm/test/CodeGen/X86/sincos.ll b/llvm/test/CodeGen/X86/sincos.ll
index 7903407..9206c25 100644
--- a/llvm/test/CodeGen/X86/sincos.ll
+++ b/llvm/test/CodeGen/X86/sincos.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Make sure this testcase codegens to the sin and cos instructions, not calls
-; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math  | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3  | FileCheck %s
 ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s
 
 declare float  @sinf(float) readonly
diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index c0beb6f..2822d40 100644
--- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s --check-prefix=CST --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
 
 ; Check that the constant used in the vectors are the right ones.
 ; SSE2: [[MASKCSTADDR:.LCPI[0-9_]+]]:
diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
new file mode 100644
index 0000000..f6e941a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+;; Aggressive instcombine merges the two i8 stores into an i16 store. Check
+;; that the debug location and DIAssignID metadata get merged.
+
+; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] {
+; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]]
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+;    CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8),
+;    CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]])
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+;    CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8),
+;    CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]])
+; CHECK-NEXT: ret void
+
+; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]])
+
+define void @test_i16(i16 %x, ptr %p) !dbg !5 {
+  %x.0 = trunc i16 %x to i8
+  store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17
+    #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18)
+  %shr.1 = lshr i16 %x, 8
+  %x.1 = trunc i16 %shr.1 to i8
+  %gep.1 = getelementptr i8, ptr %p, i64 1
+  store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20
+    #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18)
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 7}
+!3 = !{i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned)
+!16 = !DILocation(line: 2, column: 1, scope: !5)
+!17 = distinct !DIAssignID()
+!18 = !DILocation(line: 1, column: 1, scope: !5)
+!19 = !DILocation(line: 6, column: 1, scope: !5)
+!20 = distinct !DIAssignID()
diff --git a/llvm/test/Instrumentation/AllocToken/basic.ll b/llvm/test/Instrumentation/AllocToken/basic.ll
index 099d37d..0c34b137 100644
--- a/llvm/test/Instrumentation/AllocToken/basic.ll
+++ b/llvm/test/Instrumentation/AllocToken/basic.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/AllocToken/basic32.ll b/llvm/test/Instrumentation/AllocToken/basic32.ll
index 944a452..52d1d14 100644
--- a/llvm/test/Instrumentation/AllocToken/basic32.ll
+++ b/llvm/test/Instrumentation/AllocToken/basic32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
 
diff --git a/llvm/test/Instrumentation/AllocToken/fast.ll b/llvm/test/Instrumentation/AllocToken/fast.ll
index 19a3ef6..f6bf5ee 100644
--- a/llvm/test/Instrumentation/AllocToken/fast.ll
+++ b/llvm/test/Instrumentation/AllocToken/fast.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic.ll b/llvm/test/Instrumentation/AllocToken/intrinsic.ll
index 13aaa90..5c6f2f1 100644
--- a/llvm/test/Instrumentation/AllocToken/intrinsic.ll
+++ b/llvm/test/Instrumentation/AllocToken/intrinsic.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID.
 ;
-; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
index eb5dbbe..15f7c25 100644
--- a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
+++ b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; Test that the alloc-token pass lowers the intrinsic to a constant token ID.
 ;
-; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
 target triple = "i386-pc-linux-gnu"
diff --git a/llvm/test/Instrumentation/AllocToken/invoke.ll b/llvm/test/Instrumentation/AllocToken/invoke.ll
index 347c99a..8e7ab38 100644
--- a/llvm/test/Instrumentation/AllocToken/invoke.ll
+++ b/llvm/test/Instrumentation/AllocToken/invoke.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
index 19673da..45f573e 100644
--- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
+++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-extended -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-extended -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
index 1f77648..4d1be5e 100644
--- a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
+++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 434ac84..3d759f7 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -865,33 +865,6 @@ entry:
   ret float %r
 }
 
-; Note that the `unsafe-fp-math` from the function attributes should be moved to
-; individual instructions, with the shadow instructions NOT getting the attribute.
-define float @param_add_return_float_unsafe_fp_math(float %a) #0 {
-; CHECK-LABEL: @param_add_return_float_unsafe_fp_math(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__nsan_shadow_args_tag, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64)
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr @__nsan_shadow_args_ptr, align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fpext float [[A:%.*]] to double
-; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], double [[TMP2]], double [[TMP3]]
-; CHECK-NEXT:    store i64 0, ptr @__nsan_shadow_args_tag, align 8
-; CHECK-NEXT:    [[B:%.*]] = fadd fast float [[A]], 1.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd double [[TMP4]], 1.000000e+00
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @__nsan_internal_check_float_d(float [[B]], double [[TMP5]], i32 1, i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = fpext float [[B]] to double
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], double [[TMP8]], double [[TMP5]]
-; CHECK-NEXT:    store i64 ptrtoint (ptr @param_add_return_float_unsafe_fp_math to i64), ptr @__nsan_shadow_ret_tag, align 8
-; CHECK-NEXT:    store double [[TMP9]], ptr @__nsan_shadow_ret_ptr, align 8
-; CHECK-NEXT:    ret float [[B]]
-;
-entry:
-  %b = fadd float %a, 1.0
-  ret float %b
-}
-
-
 define void @truncate(<2 x double> %0) sanitize_numerical_stability {
 ; DQQ-LABEL: @truncate(
 ; DQQ-NEXT:  entry:
@@ -941,4 +914,4 @@ entry:
 }
 
 
-attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly uwtable sanitize_numerical_stability "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
diff --git a/llvm/test/LTO/AArch64/Inputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll
new file mode 100644
index 0000000..7c2a753
--- /dev/null
+++ b/llvm/test/LTO/AArch64/Inputs/bar.ll
@@ -0,0 +1,35 @@
+;; This file uses the new semantics of branch-target-enforcement and sign-return-address.
+;; It is used to test a mixed link case and also to verify the import in llc.
+
+; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local void @bar() #0 {
+entry:
+  ret void
+}
+; CHECK-LABEL: bar:
+; CHECK-NOT:       hint
+; CHECK-NOT:       bti
+; CHECK:           ret
+
+define dso_local void @baz() #1 {
+entry:
+  ret void
+}
+
+; CHECK-LABEL: baz:
+; CHECK:           bti c
+; CHECK:           ret
+
+attributes #0 = { noinline nounwind optnone uwtable }
+attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 8, !"branch-target-enforcement", i32 2}
+!1 = !{i32 8, !"sign-return-address", i32 2}
+!2 = !{i32 8, !"sign-return-address-all", i32 2}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2}
diff --git a/llvm/test/LTO/AArch64/Inputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll
new file mode 100644
index 0000000..e578426
--- /dev/null
+++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll
@@ -0,0 +1,41 @@
+;; This file uses the previous semantics of branch-target-enforcement and sign-return-address.
+;; It is used to test a mixed link case and also to verify the import in llc.
+
+; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @func()
+
+define i32 @fiz_on() #0 {
+entry:
+  call void @func()
+  ret i32 42
+}
+
+; CHECK-LABEL: fiz_on:
+; CHECK:           paciasp
+; CHECK:           bl func
+; CHECK:           retaa
+
+define i32 @fiz_off() #1 {
+entry:
+  ret i32 43
+}
+
+; CHECK-LABEL: fiz_off:
+; CHECK-NOT:       pac
+; CHECK-NOT:       hint
+; CHECK-NOT:       bti
+; CHECK:           ret
+
+attributes #0 = { noinline nounwind optnone uwtable }
+attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 8, !"branch-target-enforcement", i32 1}
+!1 = !{i32 8, !"sign-return-address", i32 1}
+!2 = !{i32 8, !"sign-return-address-all", i32 0}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0}
diff --git a/llvm/test/LTO/AArch64/Inputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll
index 961b0d4..689d938 100644
--- a/llvm/test/LTO/AArch64/Inputs/foo.ll
+++ b/llvm/test/LTO/AArch64/Inputs/foo.ll
@@ -1,12 +1,34 @@
+;; This file uses the previous semantics of branch-target-enforcement and sign-return-address.
+;; It is used to test a mixed link case and also to verify the import in llc.
+
+; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s
+
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
-define dso_local i32 @foo() #0 {
+define i32 @foo_on() #0 {
 entry:
   ret i32 42
 }
 
+; CHECK-LABEL: foo_on:
+; CHECK:           pacibsp
+; CHECK:           mov
+; CHECK:           retab
+
+define i32 @foo_off() #1 {
+entry:
+  ret i32 43
+}
+
+; CHECK-LABEL: foo_off:
+; CHECK-NOT:       pac
+; CHECK-NOT:       hint
+; CHECK-NOT:       bti
+; CHECK:           ret
+
 attributes #0 = { noinline nounwind optnone uwtable }
+attributes #1 = { noinline nounwind optnone uwtable "branch-target-enforcement"="false" "sign-return-address"="none" }
 
 !llvm.module.flags = !{!0, !1, !2, !3}
 
diff --git a/llvm/test/LTO/AArch64/Inputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll
new file mode 100644
index 0000000..2b1758b
--- /dev/null
+++ b/llvm/test/LTO/AArch64/Inputs/old.ll
@@ -0,0 +1,59 @@
+;; This file uses the previous semantics of branch-target-enforcement and sign-return-address.
+;; It is used to test a mixed link case and also to verify the import in llc.
+
+; RUN: llc -mattr=+pauth -mattr=+bti %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @old_bti() #0 {
+entry:
+  ret i32 2
+}
+
+; CHECK-LABEL: old_bti:
+; CHECK:           bti c
+; CHECK:           mov
+; CHECK:           ret
+
+define i32 @old_pac() #1 {
+entry:
+  ret i32 2
+}
+
+; CHECK-LABEL: old_pac:
+; CHECK:           paciasp
+; CHECK:           mov
+; CHECK:           retaa
+
+
+define i32 @old_none() #2 {
+entry:
+  ret i32 3
+}
+
+; CHECK-LABEL: old_none:
+; CHECK-NOT:           hint
+; CHECK-NOT:           paci
+; CHECK-NOT:           bti
+; CHECK:           ret
+
+declare i32 @func(i32)
+
+define i32 @old_none_leaf() #3 {
+entry:
+  %0 = call i32 @func()
+  ret i32 %0
+}
+
+; CHECK-LABEL: old_none_leaf:
+; CHECK:           paciasp
+; CHECK:           bl      func
+; CHECK:           retaa
+
+attributes #0 = { noinline nounwind optnone "branch-target-enforcement"="true" }
+attributes #1 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="all" "sign-return-address-key"="a_key" }
+attributes #2 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="none" }
+attributes #3 = { noinline nounwind optnone "branch-target-enforcement"="false" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" }
+
+;; Intentionally no module flags
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index b3c9828..20254de 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -1,10 +1,10 @@
-; Testcase to check that module with different branch-target-enforcement can
-; be mixed.
-;
+;; Test case to check that modules with different branch-target-enforcement
+;; settings can be mixed.
+;;
 ; RUN: llvm-as %s -o %t1.bc
 ; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
 ; RUN: llvm-lto -exported-symbol main \
-; RUN:          -exported-symbol foo \
+; RUN:          -exported-symbol foo_on \
 ; RUN:          -filetype=obj \
 ; RUN:           %t1.bc %t2.bc \
 ; RUN:           -o %t1.exe 2>&1 | FileCheck --allow-empty %s
@@ -14,11 +14,11 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
-declare i32 @foo();
+declare i32 @foo_on();
 
 define i32 @main() "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" {
 entry:
-  %add = call i32 @foo()
+  %add = call i32 @foo_on()
   ret i32 %add
 }
 
@@ -30,9 +30,12 @@ entry:
 
 ; CHECK-NOT: linking module flags 'branch-target-enforcement': IDs have conflicting values in
 ; CHECK-DUMP: <main>:
+; CHECK-DUMP:      paciasp
+; CHECK-DUMP:      str
 ; CHECK-DUMP:      bl      0x8 <main+0x8>
-; CHECK-DUMP: <foo>:
+; CHECK-DUMP: <foo_on>:
+; CHECK-DUMP:     pacibsp
 
-; `main` doesn't support BTI while `foo` does, so in the binary
-; we should see only PAC which is supported by both.
+;; `main` doesn't support BTI while `foo_on` does, so in the binary
+;; we should see only PAC which is supported by both.
 ; CHECK-PROP:   Properties: aarch64 feature: PAC
 \ No newline at end of file
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
new file mode 100644
index 0000000..331e481
--- /dev/null
+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll
@@ -0,0 +1,127 @@
+;; Test case to check that modules with different sign-return-address
+;; settings can be mixed.
+;
+; RUN: llvm-as %s -o %t1.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc
+; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc
+; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc
+; RUN: llvm-lto -exported-symbol main \
+; RUN:          -exported-symbol foo_on \
+; RUN:          -exported-symbol foo_off \
+; RUN:          -exported-symbol fiz_on \
+; RUN:          -exported-symbol fiz_off \
+; RUN:          -exported-symbol bar \
+; RUN:          -exported-symbol baz \
+; RUN:          -exported-symbol old_bti \
+; RUN:          -exported-symbol old_pac \
+; RUN:          -exported-symbol old_none \
+; RUN:          -filetype=obj \
+; RUN:          %t5.bc %t4.bc %t3.bc %t2.bc %t1.bc \
+; RUN:           -o %t1.exe 2>&1
+; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s
+; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare i32 @foo_on();
+declare i32 @foo_off();
+declare i32 @fiz_on();
+declare i32 @fiz_off();
+declare void @baz();
+declare void @bar();
+declare i32 @old_bti();
+declare i32 @old_pac();
+declare i32 @old_none();
+
+define i32 @main() #0 {
+entry:
+  call i32 @foo_on()
+  call i32 @foo_off()
+  call i32 @fiz_on()
+  call i32 @fiz_off()
+  call void @bar()
+  call void @baz()
+  call i32 @old_bti()
+  call i32 @old_pac()
+  call i32 @old_none()
+  ret i32 0
+}
+
+attributes #0 = { noinline nounwind optnone }
+
+!llvm.module.flags = !{!0, !1, !2, !3 }
+!0 = !{i32 8, !"branch-target-enforcement", i32 0}
+!1 = !{i32 8, !"sign-return-address", i32 0}
+!2 = !{i32 8, !"sign-return-address-all", i32 0}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0}
+
+
+; CHECK-DUMP-LABEL: <old_bti>:
+; CHECK-DUMP-NEXT:     bti c
+; CHECK-DUMP-NEXT:     mov     w0, #0x2
+; CHECK-DUMP-NEXT:     ret
+
+; CHECK-DUMP-LABEL: <old_pac>:
+; CHECK-DUMP-NEXT:     paciasp
+; CHECK-DUMP-NEXT:     mov     w0, #0x2
+; CHECK-DUMP-NEXT:     autiasp
+; CHECK-DUMP-NEXT:     ret
+
+; CHECK-DUMP-LABEL: <old_none>:
+; CHECK-DUMP-NEXT:     mov     w0, #0x3
+; CHECK-DUMP-NEXT:     ret
+
+; CHECK-DUMP-LABEL: <bar>:
+; CHECK-DUMP-NEXT:     ret
+
+; CHECK-DUMP-LABEL: <baz>:
+; CHECK-DUMP-NEXT:     bti c
+; CHECK-DUMP-NEXT:     ret
+
+;; fiz.ll represents a module with the old style of the function attributes.
+;; fiz_on shall have PAC with A-key as it is requested at module level.
+; CHECK-DUMP-LABEL: <fiz_on>:
+; CHECK-DUMP-NEXT:     paciasp
+; CHECK-DUMP-NEXT:     str     x30, [sp, #-0x10]!
+; CHECK-DUMP-NEXT:     bl      0x38 <fiz_on+0x8>
+; CHECK-DUMP-NEXT:     mov     w0, #0x2a
+; CHECK-DUMP-NEXT:     ldr     x30, [sp], #0x10
+; CHECK-DUMP-NEXT:     autiasp
+; CHECK-DUMP-NEXT:     ret
+
+;; fiz_off shall not have BTI or PAC instructions as they are disabled at function scope.
+; CHECK-DUMP-LABEL:  <fiz_off>:
+; CHECK-DUMP-NEXT:       mov     w0, #0x2b
+; CHECK-DUMP-NEXT:       ret
+
+;; foo.ll represents a module with the old style of the function attributes.
+;; foo_on shall have PAC with B-key as it is requested at module level.
+; CHECK-DUMP-LABEL: <foo_on>:
+; CHECK-DUMP-NEXT:     pacibsp
+; CHECK-DUMP-NEXT:     mov     w0, #0x2a
+; CHECK-DUMP-NEXT:     autibsp
+; CHECK-DUMP-NEXT:     ret
+
+;; foo_off shall not have BTI or PAC instructions as they are disabled at function scope.
+; CHECK-DUMP-LABEL:  <foo_off>:
+; CHECK-DUMP-NEXT:       mov     w0, #0x2b
+; CHECK-DUMP-NEXT:       ret
+
+; CHECK-DUMP-LABEL: <main>:
+; CHECK-DUMP-NOT:       paciasp
+; CHECK-DUMP-NEXT:      str     x30,
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+; CHECK-DUMP-NEXT:      bl
+
+;; `main` doesn't support PAC sign-return-address while `foo_on` does, so in the binary
+;; we should not see the PAC property.
+; CHECK-PROP-NOT:   Properties: aarch64 feature: PAC
diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll
index a90f212..b5984bf 100644
--- a/llvm/test/Linker/link-arm-and-thumb.ll
+++ b/llvm/test/Linker/link-arm-and-thumb.ll
@@ -13,11 +13,11 @@ entry:
   ret i32 %add
 }
 
-; CHECK: define i32 @main() {
+; CHECK: define i32 @main()
 ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]]
 ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]]
 
-; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" }
-; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" }
+; CHECK: attributes [[ARM_ATTRS]] = {{{.*}}"target-features"="-thumb-mode" }
+; CHECK: attributes [[THUMB_ATTRS]] = {{{.*}}"target-features"="+thumb-mode" }
 
 ; STDERR-NOT: warning: Linking two modules of different target triples:
diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s
index 78aa8f2..3faea99 100644
--- a/llvm/test/MC/AMDGPU/literals.s
+++ b/llvm/test/MC/AMDGPU/literals.s
@@ -20,282 +20,282 @@
 //---------------------------------------------------------------------------//
 
 v_fract_f64 v[0:1], 0.5
-// SICI: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0.5             ; encoding: [0xf0,0x7c,0x00,0x7e]
 
 v_sqrt_f64 v[0:1], -4.0
-// SICI: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x68,0x00,0x7e]
-// GFX89: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x50,0x00,0x7e]
-// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x68,0x00,0x7e]
 // GFX11: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x68,0x00,0x7e]
+// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x68,0x00,0x7e]
+// GFX89: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x50,0x00,0x7e]
+// SICI: v_sqrt_f64_e32 v[0:1], -4.0             ; encoding: [0xf7,0x68,0x00,0x7e]
 
 v_log_clamp_f32 v1, 0.5
 // NOGFX8PLUS: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // SICI: v_log_clamp_f32_e32 v1, 0.5             ; encoding: [0xf0,0x4c,0x02,0x7e]
 
 v_trunc_f32 v0, 0.5
-// SICI: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0.5                 ; encoding: [0xf0,0x42,0x00,0x7e]
 
 v_fract_f64 v[0:1], -1.0
-// SICI: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -1.0            ; encoding: [0xf3,0x7c,0x00,0x7e]
 
 v_trunc_f32 v0, -1.0
-// SICI: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1.0                ; encoding: [0xf3,0x42,0x00,0x7e]
 
 v_fract_f64 v[0:1], 4.0
-// SICI: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 4.0             ; encoding: [0xf6,0x7c,0x00,0x7e]
 
 v_trunc_f32 v0, 4.0
-// SICI: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 4.0                 ; encoding: [0xf6,0x42,0x00,0x7e]
 
 v_fract_f64 v[0:1], 0.0
-// SICI: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0               ; encoding: [0x80,0x7c,0x00,0x7e]
 
 v_trunc_f32 v0, 0.0
-// SICI: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
 
 v_fract_f64 v[0:1], 1.5
-// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
 // GFX11: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000      ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
 
 v_trunc_f32 v0, 1.5
-// SICI: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
-// GFX89: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f]
-// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
 // GFX11: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// GFX89: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// SICI: v_trunc_f32_e32 v0, 0x3fc00000          ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
 
 v_fract_f64 v[0:1], -3.1415
-// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
-// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 // GFX11: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
 // GFX12: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
 // GFX1250: v_fract_f64_e32 v[0:1], 0xc00921cac083126f ; encoding: [0xfe,0x7c,0x00,0x7e,0x6f,0x12,0x83,0xc0,0xca,0x21,0x09,0xc0]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca      ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, -3.1415
-// SICI: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
-// GFX89: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0]
-// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
 // GFX11: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// GFX89: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// SICI: v_trunc_f32_e32 v0, 0xc0490e56          ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
 
 v_fract_f64 v[0:1], 100000000000000000000000.0
-// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
-// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 // GFX11: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
 // GFX12: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
 // GFX1250: v_fract_f64_e32 v[0:1], 0x44b52d02c7e14af6 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf6,0x4a,0xe1,0xc7,0x02,0x2d,0xb5,0x44]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02      ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, 100000000000000000000000.0
-// SICI: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
-// GFX89: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65]
-// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
 // GFX11: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// GFX89: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// SICI: v_trunc_f32_e32 v0, 0x65a96816          ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
 
 v_fract_f64 v[0:1], 10000000.0
-// SICI: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
-// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
 // GFX11: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// SICI: v_fract_f64_e32 v[0:1], 0x416312d0      ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
 
 v_trunc_f32 v0, 10000000.0
-// SICI: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
-// GFX89: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b]
-// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
 // GFX11: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// GFX89: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// SICI: v_trunc_f32_e32 v0, 0x4b189680          ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
 
 v_fract_f64 v[0:1], 3.402823e+38
-// SICI: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
-// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 // GFX11: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
 // GFX12: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
 // GFX1250: v_fract_f64_e32 v[0:1], 0x47efffff966ad924 ; encoding: [0xfe,0x7c,0x00,0x7e,0x24,0xd9,0x6a,0x96,0xff,0xff,0xef,0x47]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x47efffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, 3.402823e+38
-// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
-// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
-// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
 // GFX11: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd          ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
 
 v_fract_f64 v[0:1], 2.3509886e-38
-// SICI: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
-// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 // GFX11: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
 // GFX12: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
 // GFX1250: v_fract_f64_e32 v[0:1], 0x381fffffe8c9d9fb ; encoding: [0xfe,0x7c,0x00,0x7e,0xfb,0xd9,0xc9,0xe8,0xff,0xff,0x1f,0x38]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x381fffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, 2.3509886e-38
-// SICI: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
-// GFX89: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
 // GFX11: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
+// GFX89: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00]
+// SICI: v_trunc_f32_e32 v0, 0xffffff            ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
 
 v_fract_f64 v[0:1], 2.3509886e-70
-// SICI: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 // GFX11: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
 // GFX12: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
 // GFX1250: v_fract_f64_e32 v[0:1], 0x3179f623c2d3cf3c ; encoding: [0xfe,0x7c,0x00,0x7e,0x3c,0xcf,0xd3,0xc2,0x23,0xf6,0x79,0x31]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x3179f623      ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, 2.3509886e-70
 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction
 
 v_fract_f64_e32 v[0:1], 1.0
-// SICI: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 1.0             ; encoding: [0xf2,0x7c,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], lit(1.0)
-// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f]
 // GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
 // GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
 // GFX1250: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f,0x00,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f]
+// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cos_f16_e32 v5.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
 // GFX11: v_cos_f16_e32 v5.l, 1.0                 ; encoding: [0xf2,0xc2,0x0a,0x7e]
 // GFX1250: v_cos_f16_e32 v5.l, 1.0                 ; encoding: [0xf2,0xc2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cos_f16_e32 v5.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
 // GFX11: v_cos_f16_e32 v5.l, lit(0x3c00)         ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00]
 // GFX1250: v_cos_f16_e32 v5.l, lit(0x3c00)         ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_tanh_bf16 v5, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_tanh_bf16_e32 v5, 1.0                 ; encoding: [0xf2,0x94,0x0a,0x7e]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_tanh_bf16 v5, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_tanh_bf16_e32 v5, lit(0x3f80)         ; encoding: [0xff,0x94,0x0a,0x7e,0x80,0x3f,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_trunc_f32_e32 v0, 1.0
-// SICI: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 1.0                 ; encoding: [0xf2,0x42,0x00,0x7e]
 
 v_trunc_f32_e32 v0, lit(1.0)
-// SICI: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
-// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
 // GFX11: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// SICI: v_trunc_f32_e32 v0, lit(0x3f800000)     ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
 
 v_dot2_bf16_bf16 v5.l, v1, v2, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0      ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_bf16_bf16 v5.l, v1, v2, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x3f80) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_f32_f16 v5, v1, 1.0, v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_f32_f16 v5, v1, 1.0, v2          ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c]
 // GFX12: v_dot2_f32_f16 v5, v1, 1.0, v2          ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_f32_f16 v5, v1, lit(1.0), v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2  ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00]
 // GFX12: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2  ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cvt_pk_fp8_f16 v1.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_cvt_pk_fp8_f16 v1.l, 0x3c00           ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cvt_pk_fp8_f16 v1.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x3c00)      ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 //---------------------------------------------------------------------------//
 // fp literal, expected int operand
@@ -309,118 +309,118 @@ s_mov_b64 s[0:1], lit(0.5)
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 0.5, v1
-// SICI: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0.5, v1               ; encoding: [0xf0,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, 0.5, v1
-// SICI: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0.5, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], -1.0
 // GFX8PLUS: s_mov_b64 s[0:1], -1.0                  ; encoding: [0xf3,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], -1.0                  ; encoding: [0xf3,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, -1.0, v1
-// SICI: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1.0, v1              ; encoding: [0xf3,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, -1.0, v1
-// SICI: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, -1.0, v1              ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], 4.0
 // GFX8PLUS: s_mov_b64 s[0:1], 4.0                   ; encoding: [0xf6,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], 4.0                   ; encoding: [0xf6,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, 4.0, v1
-// SICI: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 4.0, v1               ; encoding: [0xf6,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, 4.0, v1
-// SICI: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 4.0, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], 0.0
 // GFX8PLUS: s_mov_b64 s[0:1], 0                     ; encoding: [0x80,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], 0                     ; encoding: [0x80,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, 0.0, v1
-// SICI: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, 0.0, v1
-// SICI: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], 1.5
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 1.5, v1
-// SICI: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
-// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f]
-// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
 // GFX11: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
+// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
+// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f]
+// SICI: v_and_b32_e32 v0, 0x3fc00000, v1        ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
 
 s_mov_b64_e32 s[0:1], -3.1415
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, -3.1415, v1
-// SICI: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
-// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0]
-// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
 // GFX11: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
+// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
+// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0]
+// SICI: v_and_b32_e32 v0, 0xc0490e56, v1        ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
 
 s_mov_b64_e32 s[0:1], 100000000000000000000000.0
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 100000000000000000000000.0, v1
-// SICI: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
-// GFX89: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65]
-// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
 // GFX11: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
+// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
+// GFX89: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65]
+// SICI: v_and_b32_e32 v0, 0x65a96816, v1        ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
 
 s_mov_b64_e32 s[0:1], 10000000.0
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 10000000.0, v1
-// SICI: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
-// GFX89: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b]
-// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
 // GFX11: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
+// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
+// GFX89: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b]
+// SICI: v_and_b32_e32 v0, 0x4b189680, v1        ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
 
 s_mov_b64_e32 s[0:1], 3.402823e+38
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 3.402823e+38, v1
-// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
-// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f]
-// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
 // GFX11: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
+// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
+// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f]
+// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1        ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
 
 s_mov_b64_e32 s[0:1], 2.3509886e-38
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 2.3509886e-38, v1
-// SICI: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
-// GFX89: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00]
-// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
 // GFX11: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
+// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
+// GFX89: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00]
+// SICI: v_and_b32_e32 v0, 0xffffff, v1          ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
 
 s_mov_b64_e32 s[0:1], 2.3509886e-70
 // NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
@@ -429,322 +429,322 @@ v_and_b32_e32 v0, 2.3509886e-70, v1
 // NOGCN: :[[@LINE-1]]:19: error: invalid operand for instruction
 
 v_not_b16 v5.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_not_b16_e32 v5.l, 1.0                 ; encoding: [0xf2,0xd2,0x0a,0x7e]
 // GFX1250: v_not_b16_e32 v5.l, 1.0                 ; encoding: [0xf2,0xd2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_not_b16 v5.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_not_b16_e32 v5.l, lit(0x3f800000)     ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f]
 // GFX1250: v_not_b16_e32 v5.l, lit(0x3f800000)     ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_and_b32_e32 v0, 1.0, v1
-// SICI: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 1.0, v1               ; encoding: [0xf2,0x02,0x00,0x36]
 
 v_and_b32_e32 v0, lit(1.0), v1
-// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
-// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f]
-// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
 // GFX11: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
+// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
+// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f]
+// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1   ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
 
 v_pk_add_u16 v5, exec_lo, 1.0
+// GFX11: v_pk_add_u16 v5, exec_lo, 1.0           ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
 // GFX12XX: v_pk_add_u16 v5, exec_lo, 1.0           ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_u16 v5, exec_lo, 1.0           ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0xe4,0x01,0x18]
-// GFX11: v_pk_add_u16 v5, exec_lo, 1.0           ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_pk_add_u16 v5, exec_lo, lit(1.0)
-// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
+// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xca,0x03]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x3f800000) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x00,0x00,0x80,0x3f]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 //---------------------------------------------------------------------------//
 // int literal, expected fp operand
 //---------------------------------------------------------------------------//
 
 v_trunc_f32_e32 v0, 0
-// SICI: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0                   ; encoding: [0x80,0x42,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], 1
-// SICI: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 1               ; encoding: [0x81,0x7c,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], lit(1)
-// SICI: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00]
 // GFX11: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
 // GFX12: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
 // GFX1250: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xfe,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00]
+// SICI: v_fract_f64_e32 v[0:1], lit(0x1)        ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
 
 v_trunc_f32_e64 v0, 0
-// SICI: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
 // GFX11: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, 0                   ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00]
 
 v_fract_f64_e64 v[0:1], 0
-// SICI: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
 // GFX11: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], 0               ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00]
 
 v_trunc_f32_e32 v0, -13
-// SICI: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -13                 ; encoding: [0xcd,0x42,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], -13
-// SICI: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -13             ; encoding: [0xcd,0x7c,0x00,0x7e]
 
 v_trunc_f32_e64 v0, -13
-// SICI: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
 // GFX11: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, -13                 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00]
 
 v_fract_f64_e64 v[0:1], -13
-// SICI: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
 // GFX11: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], -13             ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00]
 
 v_trunc_f32_e32 v0, 35
-// SICI: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 35                  ; encoding: [0xa3,0x42,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], 35
-// SICI: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 35              ; encoding: [0xa3,0x7c,0x00,0x7e]
 
 v_trunc_f32_e64 v0, 35
-// SICI: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
 // GFX11: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, 35                  ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00]
 
 v_fract_f64_e64 v[0:1], 35
-// SICI: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
 // GFX11: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], 35              ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00]
 
 v_trunc_f32_e32 v0, 1234
-// SICI: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX89: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
 // GFX11: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX89: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// SICI: v_trunc_f32_e32 v0, 0x4d2               ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
 
 v_fract_f64_e32 v[0:1], 1234
-// SICI: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX89: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
 // GFX11: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// SICI: v_fract_f64_e32 v[0:1], 0x4d2           ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
 
 v_trunc_f32_e64 v0, 1234
+// GFX11: v_trunc_f32_e64 v0, 0x4d2               ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
 // GFX12XX: v_trunc_f32_e64 v0, 0x4d2               ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:21: error: literal operands are not supported
 // NOGFX89: :[[@LINE-3]]:21: error: literal operands are not supported
-// GFX11: v_trunc_f32_e64 v0, 0x4d2               ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-1]]:21: error: literal operands are not supported
 
 v_fract_f64_e64 v[0:1], 1234
+// GFX11: v_fract_f64_e64 v[0:1], 0x4d2           ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
 // GFX12XX: v_fract_f64_e64 v[0:1], 0x4d2           ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported
 // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported
-// GFX11: v_fract_f64_e64 v[0:1], 0x4d2           ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported
 
 v_trunc_f32_e32 v0, -54321
-// SICI: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
 // GFX11: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// SICI: v_trunc_f32_e32 v0, 0xffff2bcf          ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
 
 v_fract_f64_e32 v[0:1], -54321
-// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
 // GFX11: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf      ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
 
 v_trunc_f32_e32 v0, 0xdeadbeef
-// SICI: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
 // GFX11: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// SICI: v_trunc_f32_e32 v0, 0xdeadbeef          ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
 
 v_fract_f64_e32 v[0:1], 0xdeadbeef
-// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
 // GFX11: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef      ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
 
 v_trunc_f32_e32 v0, 0xffffffff
-// SICI: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], 0xffffffff
-// SICI: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
-// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
 // GFX11: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
+// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff]
+// SICI: v_fract_f64_e32 v[0:1], 0xffffffff      ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
 
 v_trunc_f32_e32 v0, 0x123456789abcdef0
 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
 
 v_fract_f64_e32 v[0:1], 0x123456789abcdef0
-// NOSICI: :[[@LINE-1]]:25: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:25: error: invalid operand for instruction
 // GFX1250: v_fract_f64_e32 v[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12]
-// NOGFX11: :[[@LINE-4]]:25: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:25: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:25: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:25: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:25: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:25: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:25: error: invalid operand for instruction
 
 v_trunc_f32_e32 v0, 0xffffffffffffffff
-// SICI: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1                  ; encoding: [0xc1,0x42,0x00,0x7e]
 
 v_fract_f64_e32 v[0:1], 0xffffffffffffffff
-// SICI: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x7c,0x00,0x7e]
 // GFX11: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -1              ; encoding: [0xc1,0x7c,0x00,0x7e]
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cos_f16_e32 v5.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
 // GFX11: v_cos_f16_e32 v5.l, 1                   ; encoding: [0x81,0xc2,0x0a,0x7e]
 // GFX1250: v_cos_f16_e32 v5.l, 1                   ; encoding: [0x81,0xc2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cos_f16_e32 v5.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
 // GFX11: v_cos_f16_e32 v5.l, lit(0x1)            ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00]
 // GFX1250: v_cos_f16_e32 v5.l, lit(0x1)            ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_tanh_bf16 v5, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_tanh_bf16_e32 v5, 1                   ; encoding: [0x81,0x94,0x0a,0x7e]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_tanh_bf16 v5, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_tanh_bf16_e32 v5, lit(0x1)            ; encoding: [0xff,0x94,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_trunc_f32_e32 v0, 1
-// SICI: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 1                   ; encoding: [0x81,0x42,0x00,0x7e]
 
 v_trunc_f32_e32 v0, lit(1)
-// SICI: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
 // GFX11: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e32 v0, lit(0x1)            ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
 
 v_dot2_bf16_bf16 v5.l, v1, v2, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1        ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x06,0x02]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_bf16_bf16 v5.l, v1, v2, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x1) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_f32_f16 v5, v1, 1, v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_f32_f16 v5, v1, 1, v2            ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c]
 // GFX12: v_dot2_f32_f16 v5, v1, 1, v2            ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_dot2_f32_f16 v5, v1, lit(1), v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_dot2_f32_f16 v5, v1, lit(0x1), v2     ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00]
 // GFX12: v_dot2_f32_f16 v5, v1, lit(0x1), v2     ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cvt_pk_fp8_f16 v1.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_cvt_pk_fp8_f16 v1.l, 1                ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_cvt_pk_fp8_f16 v1.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x1)         ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 //---------------------------------------------------------------------------//
 // int literal, expected int operand
@@ -755,111 +755,111 @@ s_mov_b64_e32 s[0:1], 0
 // SICI: s_mov_b64 s[0:1], 0                     ; encoding: [0x80,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, 0, v1
-// SICI: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0, v1                 ; encoding: [0x80,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, 0, v1
-// SICI: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0, v1                 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], -13
 // GFX8PLUS: s_mov_b64 s[0:1], -13                   ; encoding: [0xcd,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], -13                   ; encoding: [0xcd,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, -13, v1
-// SICI: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -13, v1               ; encoding: [0xcd,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, -13, v1
-// SICI: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, -13, v1               ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], 35
 // GFX8PLUS: s_mov_b64 s[0:1], 35                    ; encoding: [0xa3,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], 35                    ; encoding: [0xa3,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, 35, v1
-// SICI: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 35, v1                ; encoding: [0xa3,0x02,0x00,0x36]
 
 v_and_b32_e64 v0, 35, v1
-// SICI: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
 // GFX11: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 35, v1                ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00]
 
 s_mov_b64_e32 s[0:1], 1234
 // GFX8PLUS: s_mov_b64 s[0:1], 0x4d2                 ; encoding: [0xff,0x01,0x80,0xbe,0xd2,0x04,0x00,0x00]
 // SICI: s_mov_b64 s[0:1], 0x4d2                 ; encoding: [0xff,0x04,0x80,0xbe,0xd2,0x04,0x00,0x00]
 
 v_and_b32_e32 v0, 1234, v1
-// SICI: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
-// GFX89: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
 // GFX11: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
+// GFX89: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00]
+// SICI: v_and_b32_e32 v0, 0x4d2, v1             ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
 
 v_and_b32_e64 v0, 1234, v1
+// GFX11: v_and_b32_e64 v0, 0x4d2, v1             ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
 // GFX12XX: v_and_b32_e64 v0, 0x4d2, v1             ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:19: error: literal operands are not supported
 // NOGFX89: :[[@LINE-3]]:19: error: literal operands are not supported
-// GFX11: v_and_b32_e64 v0, 0x4d2, v1             ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-1]]:19: error: literal operands are not supported
 
 s_mov_b64_e32 s[0:1], -54321
-// SICI: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff]
-// GFX89: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
 // GFX11: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
 // GFX12: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
 // GFX1250: s_mov_b64 s[0:1], 0xffffffffffff2bcf    ; encoding: [0xfe,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff,0xff,0xff,0xff,0xff]
+// GFX89: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
+// SICI: s_mov_b64 s[0:1], 0xffff2bcf            ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff]
 
 v_and_b32_e32 v0, -54321, v1
-// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
-// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
 // GFX11: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
+// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff]
+// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1        ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
 
 s_mov_b64_e32 s[0:1], 0xdeadbeef
-// SICI: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde]
-// GFX89: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
 // GFX11: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
 // GFX12: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
 // GFX1250: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xfe,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
+// SICI: s_mov_b64 s[0:1], 0xdeadbeef            ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde]
 
 v_and_b32_e32 v0, 0xdeadbeef, v1
-// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
-// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
 // GFX11: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
+// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde]
+// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1        ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
 
 s_mov_b64_e32 s[0:1], 0xffffffff
-// SICI: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff]
-// GFX89: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
 // GFX11: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
 // GFX12: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
 // GFX1250: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
+// SICI: s_mov_b64 s[0:1], 0xffffffff            ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff]
 
 v_and_b32_e32 v0, 0xffffffff, v1
-// SICI: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
 
 s_mov_b64_e32 s[0:1], 0x123456789abcdef0
-// NOSICI: :[[@LINE-1]]:23: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:23: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x123456789abcdef0    ; encoding: [0xfe,0x01,0x80,0xbe,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12]
-// NOGFX11: :[[@LINE-4]]:23: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:23: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:23: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:23: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:23: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:23: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 0x123456789abcdef0, v1
@@ -870,75 +870,75 @@ s_mov_b64_e32 s[0:1], 0xffffffffffffffff
 // SICI: s_mov_b64 s[0:1], -1                    ; encoding: [0xc1,0x04,0x80,0xbe]
 
 v_and_b32_e32 v0, 0xffffffffffffffff, v1
-// SICI: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1, v1                ; encoding: [0xc1,0x02,0x00,0x36]
 
 v_not_b16 v5.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_not_b16_e32 v5.l, 1                   ; encoding: [0x81,0xd2,0x0a,0x7e]
 // GFX1250: v_not_b16_e32 v5.l, 1                   ; encoding: [0x81,0xd2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_not_b16 v5.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_not_b16_e32 v5.l, lit(0x1)            ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00]
 // GFX1250: v_not_b16_e32 v5.l, lit(0x1)            ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 s_mov_b64 s[0:1], 1
 // GFX8PLUS: s_mov_b64 s[0:1], 1                     ; encoding: [0x81,0x01,0x80,0xbe]
 // SICI: s_mov_b64 s[0:1], 1                     ; encoding: [0x81,0x04,0x80,0xbe]
 
 s_mov_b64 s[0:1], lit(1)
-// SICI: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]
-// GFX89: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX11: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX12: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
 // GFX1250: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
+// SICI: s_mov_b64 s[0:1], lit(0x1)              ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]
 
 v_and_b32_e32 v0, 1, v1
-// SICI: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 1, v1                 ; encoding: [0x81,0x02,0x00,0x36]
 
 v_and_b32_e32 v0, lit(1), v1
-// SICI: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
-// GFX89: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
-// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
 // GFX11: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX89: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
+// SICI: v_and_b32_e32 v0, lit(0x1), v1          ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
 
 v_pk_add_u16 v5, exec_lo, 1
+// GFX11: v_pk_add_u16 v5, exec_lo, 1             ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
 // GFX12XX: v_pk_add_u16 v5, exec_lo, 1             ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_u16 v5, exec_lo, 1             ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0x02,0x01,0x18]
-// GFX11: v_pk_add_u16 v5, exec_lo, 1             ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_pk_add_u16 v5, exec_lo, lit(1)
-// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1)      ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1)      ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1)      ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x06,0x02]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x1) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 
 //---------------------------------------------------------------------------//
 // 1/(2*PI)
@@ -948,46 +948,46 @@ v_trunc_f32_e32 v0, 0x3fc45f306dc9c882
 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
 
 v_fract_f64_e32 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
 // GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction
 
 v_trunc_f32_e32 v0, 0x3e22f983
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983          ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983          ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 
 v_fract_f64_e32 v[0:1], 0x3e22f983
-// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 // GFX11: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983      ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 
 v_trunc_f32_e64 v0, 0x3fc45f306dc9c882
 // NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
 
 v_fract_f64_e64 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
 // GFX11: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction
 
 v_trunc_f32_e64 v0, 0x3e22f983
-// GFX89: v_trunc_f32_e64 v0, 0.15915494          ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 0.15915494          ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:21: error: literal operands are not supported
 // GFX11: v_trunc_f32_e64 v0, 0.15915494          ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 0.15915494          ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 0.15915494          ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-2]]:21: error: literal operands are not supported
 
 v_fract_f64_e64 v[0:1], 0x3e22f983
+// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983      ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
 // GFX12XX: v_fract_f64_e64 v[0:1], 0x3e22f983      ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
-// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported
 // NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported
-// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983      ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
+// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported
 
 s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
@@ -996,37 +996,37 @@ s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
 // NOSICIVI: :[[@LINE-2]]:23: error: invalid operand for instruction
 
 v_and_b32_e32 v0, 0.159154943091895317852646485335, v1
-// SICI: v_and_b32_e32 v0, 0x3e22f983, v1        ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]
-// GFX89: v_and_b32_e32 v0, 0.15915494, v1        ; encoding: [0xf8,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1        ; encoding: [0xf8,0x02,0x00,0x36]
 // GFX11: v_and_b32_e32 v0, 0.15915494, v1        ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1        ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0.15915494, v1        ; encoding: [0xf8,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0x3e22f983, v1        ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]
 
 v_and_b32_e64 v0, 0.159154943091895317852646485335, v1
-// GFX89: v_and_b32_e64 v0, 0.15915494, v1        ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1        ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
-// NOSICI: :[[@LINE-3]]:19: error: literal operands are not supported
 // GFX11: v_and_b32_e64 v0, 0.15915494, v1        ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1        ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0.15915494, v1        ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported
 // NOSICIVI: :[[@LINE-2]]:19: error: literal operands are not supported
 
 v_fract_f64 v[0:1], 0.159154943091895317852646485335
-// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30      ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
 // GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
 // NOSICI: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30      ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
 // NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
 
 v_trunc_f32 v0, 0.159154943091895317852646485335
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983          ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
 // GFX11: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494          ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983          ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 
 v_trunc_f32 v0, lit(0.159154943091895317852646485335)
-// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 // GFX11: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983)     ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
 
 //---------------------------------------------------------------------------//
 // integer literal truncation checks
@@ -1051,54 +1051,54 @@ v_trunc_f32 v0, 0x1fffffff000
 // NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction
 
 s_mov_b64 s[0:1], 0x101ffffffff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x101ffffffff         ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x01,0x01,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
 
 s_mov_b64 s[0:1], 0x1000000001
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x1000000001          ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
 
 s_mov_b64 s[0:1], 0x1000000fff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
 // GFX1250: s_mov_b64 s[0:1], 0x1000000fff          ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0x0f,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
 
 v_trunc_f64 v[0:1], 0x1fffffffff0
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffffff0   ; encoding: [0xfe,0x2e,0x00,0x7e,0xf0,0xff,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
 
 v_trunc_f64 v[0:1], 0x100000001
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x100000001     ; encoding: [0xfe,0x2e,0x00,0x7e,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
 
 v_trunc_f64 v[0:1], 0x1fffffff000
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
 // GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffff000   ; encoding: [0xfe,0x2e,0x00,0x7e,0x00,0xf0,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
 
 //---------------------------------------------------------------------------//
@@ -1106,210 +1106,210 @@ v_trunc_f64 v[0:1], 0x1fffffff000
 //---------------------------------------------------------------------------//
 
 buffer_atomic_add v0, off, s[0:3], scc offset:4095
-// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]
-// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
-// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
 // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xfd]
+// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
+// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
+// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]
 
 s_add_i32 s0, vccz, s0
-// SICI: s_add_i32 s0, src_vccz, s0              ; encoding: [0xfb,0x00,0x00,0x81]
 // GFX89: s_add_i32 s0, src_vccz, s0              ; encoding: [0xfb,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: s_add_i32 s0, src_vccz, s0              ; encoding: [0xfb,0x00,0x00,0x81]
 
 s_add_i32 s0, execz, s0
-// SICI: s_add_i32 s0, src_execz, s0             ; encoding: [0xfc,0x00,0x00,0x81]
 // GFX89: s_add_i32 s0, src_execz, s0             ; encoding: [0xfc,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// SICI: s_add_i32 s0, src_execz, s0             ; encoding: [0xfc,0x00,0x00,0x81]
 
 s_add_i32 s0, scc, s0
-// SICI: s_add_i32 s0, src_scc, s0               ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX89: s_add_i32 s0, src_scc, s0               ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX12XX: s_add_co_i32 s0, src_scc, s0            ; encoding: [0xfd,0x00,0x00,0x81]
 // GFX11: s_add_i32 s0, src_scc, s0               ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX12XX: s_add_co_i32 s0, src_scc, s0            ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX89: s_add_i32 s0, src_scc, s0               ; encoding: [0xfd,0x00,0x00,0x81]
+// SICI: s_add_i32 s0, src_scc, s0               ; encoding: [0xfd,0x00,0x00,0x81]
 
 s_and_b64 s[0:1], s[0:1], src_vccz
-// SICI: s_and_b64 s[0:1], s[0:1], src_vccz      ; encoding: [0x00,0xfb,0x80,0x87]
 // GFX89: s_and_b64 s[0:1], s[0:1], src_vccz      ; encoding: [0x00,0xfb,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_vccz      ; encoding: [0x00,0xfb,0x80,0x87]
 
 s_and_b64 s[0:1], s[0:1], src_execz
-// SICI: s_and_b64 s[0:1], s[0:1], src_execz     ; encoding: [0x00,0xfc,0x80,0x87]
 // GFX89: s_and_b64 s[0:1], s[0:1], src_execz     ; encoding: [0x00,0xfc,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_execz     ; encoding: [0x00,0xfc,0x80,0x87]
 
 s_and_b64 s[0:1], s[0:1], src_scc
-// SICI: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x87]
-// GFX89: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x86]
-// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x8b]
 // GFX11: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX89: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x86]
+// SICI: s_and_b64 s[0:1], s[0:1], src_scc       ; encoding: [0x00,0xfd,0x80,0x87]
 
 v_add_u16 v0, vccz, v0
 // GFX89: v_add_u16_e32 v0, src_vccz, v0          ; encoding: [0xfb,0x00,0x00,0x4c]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_add_u16_sdwa v0, scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, src_scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xfd,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u16_sdwa v0, v0, scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, v0, src_scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xfa,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u32 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e32 v0, src_execz, v0         ; encoding: [0xfc,0x00,0x00,0x68]
-// NOVI: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX11: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u32_e64 v0, scc, v0
+// GFX11: v_add_nc_u32_e64 v0, src_scc, v0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
 // GFX12XX: v_add_nc_u32_e64 v0, src_scc, v0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e64 v0, src_scc, v0           ; encoding: [0x00,0x00,0x34,0xd1,0xfd,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_scc, v0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_cmp_eq_i64 vcc, scc, v[0:1]
-// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1]   ; encoding: [0xfd,0x00,0x44,0x7d]
 // GFX89: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1]   ; encoding: [0xfd,0x00,0xc4,0x7d]
-// NOGFX11: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1]   ; encoding: [0xfd,0x00,0x44,0x7d]
 
 v_max_f16 v0, execz, v0
 // GFX89: v_max_f16_e32 v0, src_execz, v0         ; encoding: [0xfc,0x00,0x00,0x5a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_max_f32 v0, vccz, v0
-// SICI: v_max_f32_e32 v0, src_vccz, v0          ; encoding: [0xfb,0x00,0x00,0x20]
 // GFX89: v_max_f32_e32 v0, src_vccz, v0          ; encoding: [0xfb,0x00,0x00,0x16]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: v_max_f32_e32 v0, src_vccz, v0          ; encoding: [0xfb,0x00,0x00,0x20]
 
 v_max_f64 v[0:1], scc, v[0:1]
-// SICI: v_max_f64 v[0:1], src_scc, v[0:1]       ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]
-// GFX89: v_max_f64 v[0:1], src_scc, v[0:1]       ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
-// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
 // GFX11: v_max_f64 v[0:1], src_scc, v[0:1]       ; encoding: [0x00,0x00,0x2a,0xd7,0xfd,0x00,0x02,0x00]
+// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
+// GFX89: v_max_f64 v[0:1], src_scc, v[0:1]       ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
+// SICI: v_max_f64 v[0:1], src_scc, v[0:1]       ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]
 
 v_pk_add_f16 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_f16 v0, src_execz, v0          ; encoding: [0x00,0x40,0x8f,0xd3,0xfc,0x00,0x02,0x18]
-// NOVI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:18: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:18: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:18: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:18: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16 v0, neg(vccz)
 // GFX89: v_ceil_f16_e64 v0, -src_vccz            ; encoding: [0x00,0x00,0x85,0xd1,0xfb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16 v0, abs(scc)
-// GFX89: v_ceil_f16_e64 v0, |src_scc|            ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
-// GFX12XX: v_ceil_f16_e64 v0, |src_scc|            ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 // GFX11: v_ceil_f16_e64 v0, |src_scc|            ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX12XX: v_ceil_f16_e64 v0, |src_scc|            ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX89: v_ceil_f16_e64 v0, |src_scc|            ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_ceil_f64 v[5:6], |execz|
-// GFX89: v_ceil_f64_e64 v[5:6], |src_execz|      ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
 // CI: v_ceil_f64_e64 v[5:6], |src_execz|      ; encoding: [0x05,0x01,0x30,0xd3,0xfc,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:21: error: src_execz register not available on this GPU
+// GFX89: v_ceil_f64_e64 v[5:6], |src_execz|      ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
+// NOGFX11: :[[@LINE-3]]:21: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 
 v_ceil_f64 v[5:6], -vcc
-// GFX89: v_ceil_f64_e64 v[5:6], -vcc             ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
 // CI: v_ceil_f64_e64 v[5:6], -vcc             ; encoding: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x20]
 // GFX11: v_ceil_f64_e64 v[5:6], -vcc             ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
 // GFX12: v_ceil_f64_e64 v[5:6], -vcc             ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
-// NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:12: error: invalid operand for instruction
+// GFX89: v_ceil_f64_e64 v[5:6], -vcc             ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 
 v_ceil_f32 v0, -vccz
-// SICI: v_ceil_f32_e64 v0, -src_vccz            ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]
 // GFX89: v_ceil_f32_e64 v0, -src_vccz            ; encoding: [0x00,0x00,0x5d,0xd1,0xfb,0x00,0x00,0x20]
-// NOGFX11: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, -src_vccz            ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]
 
 v_ceil_f32 v0, |execz|
-// SICI: v_ceil_f32_e64 v0, |src_execz|          ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]
 // GFX89: v_ceil_f32_e64 v0, |src_execz|          ; encoding: [0x00,0x01,0x5d,0xd1,0xfc,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, |src_execz|          ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]
 
 v_ceil_f16_sdwa v5, |vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, |src_vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16_sdwa v5, -scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, -src_scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfd,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f32_sdwa v5, vccz dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, src_vccz dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 
 v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, |src_execz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfc,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 
 //---------------------------------------------------------------------------//
@@ -1317,266 +1317,266 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
 //---------------------------------------------------------------------------//
 
 buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095
-// NOSICI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU
-// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
 // GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xeb]
-// NOVI: :[[@LINE-4]]:36: error: src_shared_base register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:36: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:36: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU
 
 s_add_i32 s0, src_shared_base, s0
+// GFX11: s_add_i32 s0, src_shared_base, s0       ; encoding: [0xeb,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_shared_base, s0    ; encoding: [0xeb,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
 // GFX9: s_add_i32 s0, src_shared_base, s0       ; encoding: [0xeb,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_base, s0       ; encoding: [0xeb,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU
 
 s_add_i32 s0, src_shared_limit, s0
+// GFX11: s_add_i32 s0, src_shared_limit, s0      ; encoding: [0xec,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_shared_limit, s0   ; encoding: [0xec,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_limit register not available on this GPU
 // GFX9: s_add_i32 s0, src_shared_limit, s0      ; encoding: [0xec,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_limit, s0      ; encoding: [0xec,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_limit register not available on this GPU
 
 s_add_i32 s0, src_private_base, s0
+// GFX11: s_add_i32 s0, src_private_base, s0      ; encoding: [0xed,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_private_base, s0   ; encoding: [0xed,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_base register not available on this GPU
 // GFX9: s_add_i32 s0, src_private_base, s0      ; encoding: [0xed,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_base, s0      ; encoding: [0xed,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_private_base register not available on this GPU
 
 s_add_i32 s0, src_private_limit, s0
+// GFX11: s_add_i32 s0, src_private_limit, s0     ; encoding: [0xee,0x00,0x00,0x81]
 // GFX12XX: s_add_co_i32 s0, src_private_limit, s0  ; encoding: [0xee,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_limit register not available on this GPU
 // GFX9: s_add_i32 s0, src_private_limit, s0     ; encoding: [0xee,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_limit, s0     ; encoding: [0xee,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_private_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_private_limit register not available on this GPU
 
 s_add_i32 s0, src_pops_exiting_wave_id, s0
-// NOSICI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU
 // GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81]
-// NOVI: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU
 
 s_and_b64 s[0:1], s[0:1], src_shared_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_base register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_base register not available on this GPU
 
 s_and_b64 s[0:1], s[0:1], src_shared_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_limit register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_shared_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_shared_limit register not available on this GPU
 
 s_and_b64 s[0:1], s[0:1], src_private_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_base register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_private_base register not available on this GPU
 
 s_and_b64 s[0:1], s[0:1], src_private_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
 // GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_limit register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_limit register not available on this GPU
 // NOVI: :[[@LINE-5]]:27: error: src_private_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_private_limit register not available on this GPU
 
 s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id
-// NOSICI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU
 // GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86]
-// NOVI: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU
 
 v_add_u16 v0, src_shared_base, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x4c]
-// NOVI: :[[@LINE-3]]:15: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u32 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
 // GFX12XX: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x68]
-// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u32_e64 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
 // GFX12XX: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_add_u32_e64 v0, src_shared_base, v0   ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_cmp_eq_i64 vcc, src_shared_base, v[0:1]
-// NOSICI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
 // GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d]
-// NOVI: :[[@LINE-3]]:19: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
 
 v_max_f16 v0, src_shared_base, v0
+// GFX11: v_max_f16_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x72]
 // GFX12XX: v_max_num_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x62]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_max_f16_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x5a]
-// GFX11: v_max_f16_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x72]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_max_f32 v0, src_shared_base, v0
+// GFX11: v_max_f32_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x20]
 // GFX12XX: v_max_num_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x2c]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
 // GFX9: v_max_f32_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x16]
-// GFX11: v_max_f32_e32 v0, src_shared_base, v0   ; encoding: [0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU
 
 v_max_f64 v[0:1], src_shared_base, v[0:1]
+// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
 // GFX12XX: v_max_num_f64_e32 v[0:1], src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0x00,0x1c]
-// NOSICI: :[[@LINE-2]]:19: error: src_shared_base register not available on this GPU
 // GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00]
-// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
 
 v_pk_add_f16 v0, src_shared_base, v0
+// GFX11: v_pk_add_f16 v0, src_shared_base, v0    ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
 // GFX12XX: v_pk_add_f16 v0, src_shared_base, v0    ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_pk_add_f16 v0, src_shared_base, v0    ; encoding: [0x00,0x40,0x8f,0xd3,0xeb,0x00,0x02,0x18]
-// GFX11: v_pk_add_f16 v0, src_shared_base, v0    ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16 v0, neg(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12XX: v_ceil_f16_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f16_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16 v0, abs(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12XX: v_ceil_f16_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f16_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
 // NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f64 v[5:6], |src_shared_base|
-// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
 // GFX11: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
 // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
 
 v_ceil_f64 v[5:6], -src_shared_base
-// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
 // GFX11: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
 // NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
 
 v_ceil_f32 v0, -src_shared_base
+// GFX11: v_ceil_f32_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
 // GFX12XX: v_ceil_f32_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
 // GFX9: v_ceil_f32_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f32_e64 v0, -src_shared_base     ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
 
 v_ceil_f32 v0, |src_shared_base|
+// GFX11: v_ceil_f32_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
 // GFX12XX: v_ceil_f32_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
 // GFX9: v_ceil_f32_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f32_e64 v0, |src_shared_base|    ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
 // NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
 
 v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 
 v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 // GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
 
 //---------------------------------------------------------------------------//
@@ -1584,206 +1584,206 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
 //---------------------------------------------------------------------------//
 
 v_add_u32 v0, private_base, s0
-// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:29: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_add_u32 v0, scc, s0
-// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_add_nc_u32_e64 v0, src_scc, s0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0        ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, shared_base, v0, v1
-// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
-// NOSICI: :[[@LINE-2]]:20: error: src_shared_base register not available on this GPU
 // GFX11: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
-// NOVI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
+// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:20: error: src_shared_base register not available on this GPU
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, v0, shared_limit, v1
-// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
-// NOSICI: :[[@LINE-2]]:24: error: src_shared_limit register not available on this GPU
 // GFX11: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
-// NOVI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:24: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
+// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU
+// NOVI: :[[@LINE-5]]:24: error: src_shared_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:24: error: src_shared_limit register not available on this GPU
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, v0, v1, private_limit
-// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
-// NOSICI: :[[@LINE-2]]:28: error: src_private_limit register not available on this GPU
 // GFX11: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
-// NOVI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
+// NOGFX9: :[[@LINE-3]]:28: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU
+// NOVI: :[[@LINE-5]]:28: error: src_private_limit register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:28: error: src_private_limit register not available on this GPU
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, execz, v0, v1
-// NOSICI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:20: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:20: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:20: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:20: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:20: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:20: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:20: error: src_execz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
 // NOSICIVI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions)
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, v0, scc, v1
+// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1      ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
 // GFX12XX: v_div_fmas_f32 v0, v0, src_scc, v1      ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
-// NOSICI: :[[@LINE-2]]:24: error: invalid operand (violates constant bus restrictions)
 // NOGFX89: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
-// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1      ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
+// NOSICI: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions)
 // NOSICIVI: :[[@LINE-1]]:24: error: invalid operand (violates constant bus restrictions)
 
 // v_div_fmas implicitly reads VCC
 v_div_fmas_f32 v0, v0, v1, vccz
-// NOSICI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:28: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:28: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:28: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:28: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:28: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:28: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions)
 // NOSICIVI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions)
 
 // v_addc_co_u32 implicitly reads VCC (VOP2)
 v_addc_co_u32 v0, vcc, shared_base, v0, vcc
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX9: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_madak_f32 v0, shared_base, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX9: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:17: error: src_shared_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
 
 v_madak_f32 v0, scc, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:17: error: invalid operand (violates constant bus restrictions)
 // NOSICIVI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions)
 
 v_madak_f32 v0, 0xff32ff, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed
 // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
 
 v_madak_f32 v0, 0xff32ff, v0, 1
-// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed
 // NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
 
 v_madmk_f32 v0, 0xff32ff, 0x11213141, v0
-// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed
 // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
 
 v_madmk_f32 v0, 0xff32ff, -1, v0
-// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed
 // NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
 
 v_madak_f16 v0, 0xff32, v0, 0x1122
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_madak_f16 v0, 0xff32, v0, 0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_madmk_f16 v0, 0xff32, 0x1122, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_madmk_f16 v0, 0xff32, 1, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_cmp_eq_f32 s[0:1], private_base, private_limit
-// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction
+// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
 
 v_cmp_eq_f32 s[0:1], private_base, s0
-// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction
+// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU
 // NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
 
 v_cmp_eq_f32 s[0:1], execz, s0
-// NOSICI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:29: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:22: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:22: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:22: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:22: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:22: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:22: error: src_execz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions)
 // NOSICIVI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions)
 
 v_pk_add_f16 v255, private_base, private_limit
-// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 // GFX11: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:34: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
+// NOGFX9: :[[@LINE-3]]:34: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 v_pk_add_f16 v255, vccz, execz
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-3]]:26: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:20: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:20: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
+// NOGFX9: :[[@LINE-4]]:26: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
 // NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 
 //---------------------------------------------------------------------------//
@@ -1791,36 +1791,36 @@ v_pk_add_f16 v255, vccz, execz
 //---------------------------------------------------------------------------//
 
 v_sqrt_f32 v2, lit(123)
-// SICI: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
 // GFX11: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
 
 v_sqrt_f32 v2, abs(lit(123))
-// SICI: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
 // GFX11: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f32_e32 v2, lit(0x7b)            ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
 
 v_sqrt_f32 v2, lit(123.0)
-// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
 // GFX11: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000)      ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
 
 v_sqrt_f64 v[2:3], lit(123.0)
-// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
-// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40]
 // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
 // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
 // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xfe,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40,0x00,0x00,0x00,0x00]
+// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40]
+// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000)  ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
 
 v_sqrt_f64 v[2:3], lit(123)
-// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00]
 // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
 // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
 // GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xfe,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b)        ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
 
 v_sqrt_f32 v2, lit 123.0
 // NOGCN: :[[@LINE-1]]:20: error: expected left paren after lit
@@ -1834,16 +1834,16 @@ v_sqrt_f32 v2, lit(v1)
 // Make sure lit() is accepted on operands without modifiers.
 
 v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8)
-// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00]
 // GFX89: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x30,0xe8,0x07,0x00,0x00]
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00]
 
 v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8)
-// NOSICI: :[[@LINE-1]]:24: error: not a valid operand.
-// NOGFX89: :[[@LINE-2]]:24: error: not a valid operand.
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:24: error: not a valid operand.
+// NOSICI: :[[@LINE-5]]:24: error: not a valid operand.
 // NOSICIVI: :[[@LINE-1]]:24: error: not a valid operand.
diff --git a/llvm/test/ThinLTO/AArch64/aarch64_inline.ll b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll
new file mode 100644
index 0000000..401f66d
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/aarch64_inline.ll
@@ -0,0 +1,86 @@
+;; Verify that cross-module inlining happens when module flags are upgraded.
+;; `foo` and `main` use the old module-flag semantics (value 1), while `bar` uses the new semantics (value 2).
+;; Regression test for #82763.
+
+; RUN: split-file %s %t
+; RUN: opt -module-summary %t/foo.ll -o %t/foo.o
+; RUN: opt -module-summary %t/bar.ll -o %t/bar.o
+; RUN: opt -module-summary %t/main.ll -o %t/main.o
+; RUN: llvm-lto2 run %t/main.o %t/foo.o %t/bar.o -save-temps \
+; RUN:   -o %t/t.exe \
+; RUN:   -r=%t/foo.o,foo,plx \
+; RUN:   -r=%t/bar.o,bar,plx \
+; RUN:   -r=%t/main.o,foo,l \
+; RUN:   -r=%t/main.o,bar,l \
+; RUN:   -r=%t/main.o,main,plx 2>&1
+; RUN: llvm-dis %t/t.exe.1.4.opt.bc -o - | FileCheck %s
+
+; CHECK:      define dso_local noundef i32 @main() local_unnamed_addr #0 {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  ret i32 35
+; CHECK-NEXT: }
+
+; CHECK:  attributes #0 = { {{.*}}"branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" }
+
+; CHECK: !llvm.module.flags = !{!0, !1, !2, !3}
+
+; CHECK: !0 = !{i32 8, !"branch-target-enforcement", i32 2}
+; CHECK: !1 = !{i32 8, !"sign-return-address", i32 2}
+; CHECK: !2 = !{i32 8, !"sign-return-address-all", i32 2}
+; CHECK: !3 = !{i32 8, !"sign-return-address-with-bkey", i32 2}
+
+
+;--- foo.ll
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local noundef i32 @foo() local_unnamed_addr #0 {
+entry:
+  ret i32 34
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+!llvm.module.flags = !{!0, !1, !2, !3 }
+!0 = !{i32 8, !"branch-target-enforcement", i32 1}
+!1 = !{i32 8, !"sign-return-address", i32 1}
+!2 = !{i32 8, !"sign-return-address-all", i32 1}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1}
+
+;--- bar.ll
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local noundef i32 @bar() local_unnamed_addr #0 {
+entry:
+  ret i32 1
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "branch-target-enforcement" "sign-return-address"="all" "sign-return-address-key"="b_key" }
+!llvm.module.flags = !{!0, !1, !2, !3 }
+!0 = !{i32 8, !"branch-target-enforcement", i32 2}
+!1 = !{i32 8, !"sign-return-address", i32 2}
+!2 = !{i32 8, !"sign-return-address-all", i32 2}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 2}
+
+;--- main.ll
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare i32 @foo();
+declare i32 @bar();
+
+define i32 @main() #0 {
+entry:
+  %1 = call i32 @foo()
+  %2 = call i32 @bar()
+  %3 = add i32 %1, %2
+  ret i32 %3
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2, !3 }
+!0 = !{i32 8, !"branch-target-enforcement", i32 1}
+!1 = !{i32 8, !"sign-return-address", i32 1}
+!2 = !{i32 8, !"sign-return-address-all", i32 1}
+!3 = !{i32 8, !"sign-return-address-with-bkey", i32 1}
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
new file mode 100644
index 0000000..ae3c746
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -0,0 +1,338 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=dse -S %s | FileCheck %s
+
+define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    ret void
+;
+entry: ; two identical zero stores to %dst (stride 4 == rows 4) separated only by a scalar load of the noalias %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = load double, ptr %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; same operands as the first store; the expected output keeps just one of the two
+  ret void
+}
+
+define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous zero store to %dst, scalar read of %dst, then a second contiguous store to %dst
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst ; reads the first element written by the zero store, so that store must stay
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; both stores write a 4x2 matrix to %dst with constant stride 100; only the noalias %src is read in between
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2) ; same pointer, stride and shape as the first store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_strided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; store, load and store all use the same base pointer %ptr
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2) ; first column starts at %ptr too, so it reads what the zero store wrote
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; two stride-100 stores to %dst separated only by a scalar load of the noalias %src
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %src ; reads %src only; %dst is not read between the two stores
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2) ; same pointer, stride and shape as the first store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; stride-100 zero store to %dst, scalar read of %dst, then a second stride-100 store to %dst
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst ; reads the first element written by the zero store, so that store must stay
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; both stores write %dst using the runtime value %stride; only the noalias %src is read in between
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) ; same pointer, stride value and shape as the first store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; store, load and store all use the same pointer %ptr and the same runtime %stride
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2) ; reads back exactly what the zero store wrote
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; two runtime-strided stores to %dst separated only by a scalar load of the noalias %src
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %src ; reads %src only; %dst is not read between the two stores
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) ; same pointer, stride value and shape as the first store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT:    ret void
+;
+entry: ; runtime-strided zero store to %dst, scalar read of %dst, then a second runtime-strided store to %dst
+  %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) ; result is not used by any later instruction in this function
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+  %l.2 = load double, ptr %dst ; reads the first element written by the zero store
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) ; NOTE(review): identical to the first store and absent from the expected output; @live_strided_store_non_matrix_load stores %l.1 here - confirm zeroinitializer is intended
+  ret void
+}
+
+define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; both stores write a contiguous 4x2 matrix (stride 4 == rows 4) to %dst; only the noalias %src is read in between
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; same pointer, stride and shape as the first store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_unstrided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_unstrided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous store, load and store all use the same pointer %ptr
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2) ; reads back exactly what the zero store wrote
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_non_matrix_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[DST_OFFSET]], align 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; scalar store followed by a contiguous matrix load of %src and a contiguous matrix store to %dst
+  %dst.offset = getelementptr inbounds double, ptr %src, i32 6 ; NOTE(review): named dst.offset but computed from %src - confirm whether ptr %dst was intended
+  store double 42.0, ptr %dst.offset ; as written this hits element 6 of %src, inside the 8 contiguous doubles loaded below
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @live_non_matrix_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_non_matrix_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; scalar store into %ptr, then a contiguous matrix load and store on the same %ptr
+  %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6
+  store double 42.0, ptr %ptr.offset ; element 6 lies inside the 8 contiguous doubles loaded below, so this store is read
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous matrix store of 8 doubles to %dst, later covered by a plain <8 x double> store to %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  store <8 x double> zeroinitializer, ptr %dst ; writes the same 8 doubles as the matrix store above
+  ret void ; NOTE(review): expected output keeps the matrix store - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <16 x double> zeroinitializer, ptr [[DST]], align 128
+; CHECK-NEXT:    ret void
+;
+entry: ; matrix store to %dst, later covered by a plain <16 x double> store to %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; NOTE(review): stride 4 == rows, same as the _unstrided variant; only the %src load uses stride 8 - confirm
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+  store <16 x double> zeroinitializer, ptr %dst ; 16 doubles from %dst, a superset of what the matrix store wrote
+  ret void ; NOTE(review): expected output keeps the matrix store - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <4 x double> zeroinitializer, ptr [[DST]], align 32
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous matrix store of 8 doubles to %dst, followed by a plain store of only 4 doubles to %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+  store <4 x double> zeroinitializer, ptr %dst ; covers only the first 4 of the 8 doubles written above, so the matrix store must stay
+  ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT:    ret void
+;
+entry: ; matrix store to %dst, followed by a plain <8 x double> store to %dst
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; NOTE(review): stride 4 == rows, so this writes 8 contiguous doubles that the plain store below fully covers, despite the live/strided name - confirm whether stride 8 was intended
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+  store <8 x double> zeroinitializer, ptr %dst
+  ret void
+}
+
+define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous 4x2 store (8 doubles) to %dst immediately followed by a contiguous 3x3 store (9 doubles) to %dst
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3) ; the 9 doubles written here cover the 8 written by the previous store
+  ret void ; NOTE(review): expected output keeps both stores - presumably a baseline for a later DSE improvement; confirm
+}
+
+define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; contiguous 3x3 store (9 doubles) to %dst followed by a contiguous 4x2 store (8 doubles) to %dst
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3)
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; covers only 8 of the 9 doubles written above, so the 3x3 store must stay
+  ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) ; operands: pointer, stride, volatile flag, rows, columns
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v9f64.i32(<9 x double>, ptr, i32, i1, i32, i32) ; used by the *_dimension_change tests
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
new file mode 100644
index 0000000..78dbfe1
--- /dev/null
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=gvn -S %s | FileCheck %s
+
+define void @redundant_unstrided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT:    ret void
+;
+entry: ; the same contiguous load is issued twice with a non-overlapping contiguous matrix store in between
+  %src.offset = getelementptr inbounds double, ptr %src, i32 8
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) ; reads doubles 8..15 relative to %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2) ; writes doubles 0..7 relative to %src
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) ; same operands as %l
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.2)
+  ret void ; NOTE(review): expected output keeps both loads - presumably a baseline for a later GVN improvement; confirm
+}
+
+define void @redundant_unstrided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT:    ret void
+;
+entry: ; the same contiguous load is issued twice with a non-overlapping scalar store in between
+  %src.offset = getelementptr inbounds double, ptr %src, i32 1
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) ; reads doubles 1..8 relative to %src
+  store double 42.0, ptr %src ; writes double 0 only, just before the loaded range
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) ; same operands as %l
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.2)
+  ret void ; NOTE(review): expected output keeps both loads - presumably a baseline for a later GVN improvement; confirm
+}
+
+define void @redundant_strided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT:    ret void
+;
+entry: ; the same stride-8 load is issued twice with a non-overlapping stride-8 matrix store in between
+  %src.offset = getelementptr inbounds double, ptr %src, i32 16
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; columns at doubles 16..19 and 24..27 relative to %src
+  call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) ; columns at doubles 0..3 and 8..11 relative to %src
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; same operands as %l
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.2)
+  ret void ; NOTE(review): expected output keeps both loads - presumably a baseline for a later GVN improvement; confirm
+
+}
+
+define void @redundant_strided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT:    ret void
+;
+entry: ; the same stride-8 load is issued twice with a non-overlapping scalar store in between
+  %src.offset = getelementptr inbounds double, ptr %src, i32 16
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; columns at doubles 16..19 and 24..27 relative to %src
+  store double 42.0, ptr %src ; writes double 0 only, outside both loaded columns
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; same operands as %l
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.2)
+  ret void ; NOTE(review): expected output keeps both loads - presumably a baseline for a later GVN improvement; confirm
+}
+
+define void @repeat_load_dimension_change_project(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_project(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT:    [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT:    ret void
+;
+entry: ; loads %src as a contiguous 4x2 and then as a contiguous 3x3 matrix, with no store in between
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) ; doubles 0..7 of %src
+  %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) ; doubles 0..8 of %src
+  %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; keeps lanes 0..7 of %l.2, i.e. the same memory %l read
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.3)
+  ret void
+}
+
+define void @repeat_load_dimension_change_shuffle(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT:    [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    call void @use(<8 x double> [[L]])
+; CHECK-NEXT:    call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT:    ret void
+;
+entry: ; same two loads as @repeat_load_dimension_change_project, but the shuffle selects different lanes
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) ; doubles 0..7 of %src
+  %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3) ; doubles 0..8 of %src
+  %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; lane 8 of %l.2 followed by seven lanes of the zero operand, so not the same value as %l
+  call void @use(<8 x double> %l)
+  call void @use(<8 x double> %l.3)
+  ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32) ; operands: pointer, stride, volatile flag, rows, columns
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32) ; used by the repeat_load_dimension_change_* tests
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
+declare void @use(<8 x double>) ; external function with no body; each test passes its loaded values to it
diff --git a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll
index b3f2e81..15ce3e3 100644
--- a/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-compatible-and-attribute-transfer.ll
@@ -5,7 +5,7 @@
 ; attributes that should be transferred only if it is on all of the regions.
 
 ; This includes the attributes, no-nans-fp-math,
-; no-signed-zeros-fp-math, less-precise-fpmad, unsafe-fp-math, and
+; no-signed-zeros-fp-math, less-precise-fpmad, and
 ; no-infs-fp-math.  Only when each instance of similarity has these attributes
 ; can we say that the outlined function can have these attributes since that
 ; is the more general case for these attributes.
@@ -101,7 +101,7 @@ entry:
 }
 
 attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "less-precise-fpmad"="true"
-"unsafe-fp-math"="true" "no-infs-fp-math"="true"}
+"no-infs-fp-math"="true"}
 
 ; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) [[ATTR1:#[0-9]+]] {
 ; CHECK: entry_to_outline:
@@ -122,5 +122,5 @@ attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "les
 ; CHECK-NEXT:    [[CL:%.*]] = load i32, ptr [[ARG2]], align 4
 
 
-; CHECK: attributes [[ATTR1]] =   { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "unsafe-fp-math"="false" }
-; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
+; CHECK: attributes [[ATTR1]] =   { minsize optsize "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" }
+; CHECK: attributes [[ATTR]] = { minsize optsize "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/Transforms/Inline/attributes.ll b/llvm/test/Transforms/Inline/attributes.ll
index 55ab430..da7eeda 100644
--- a/llvm/test/Transforms/Inline/attributes.ll
+++ b/llvm/test/Transforms/Inline/attributes.ll
@@ -601,46 +601,6 @@ define i32 @test_no-signed-zeros-fp-math3(i32 %i) "no-signed-zeros-fp-math"="tru
 ; CHECK-NEXT: ret i32
 }
 
-define i32 @unsafe-fp-math_callee0(i32 %i) "unsafe-fp-math"="false" {
-  ret i32 %i
-; CHECK: @unsafe-fp-math_callee0(i32 %i) [[UNSAFE_FPMATH_FALSE:#[0-9]+]] {
-; CHECK-NEXT: ret i32
-}
-
-define i32 @unsafe-fp-math_callee1(i32 %i) "unsafe-fp-math"="true" {
-  ret i32 %i
-; CHECK: @unsafe-fp-math_callee1(i32 %i) [[UNSAFE_FPMATH_TRUE:#[0-9]+]] {
-; CHECK-NEXT: ret i32
-}
-
-define i32 @test_unsafe-fp-math0(i32 %i) "unsafe-fp-math"="false" {
-  %1 = call i32 @unsafe-fp-math_callee0(i32 %i)
-  ret i32 %1
-; CHECK: @test_unsafe-fp-math0(i32 %i) [[UNSAFE_FPMATH_FALSE]] {
-; CHECK-NEXT: ret i32
-}
-
-define i32 @test_unsafe-fp-math1(i32 %i) "unsafe-fp-math"="false" {
-  %1 = call i32 @unsafe-fp-math_callee1(i32 %i)
-  ret i32 %1
-; CHECK: @test_unsafe-fp-math1(i32 %i) [[UNSAFE_FPMATH_FALSE]] {
-; CHECK-NEXT: ret i32
-}
-
-define i32 @test_unsafe-fp-math2(i32 %i) "unsafe-fp-math"="true" {
-  %1 = call i32 @unsafe-fp-math_callee0(i32 %i)
-  ret i32 %1
-; CHECK: @test_unsafe-fp-math2(i32 %i) [[UNSAFE_FPMATH_FALSE]] {
-; CHECK-NEXT: ret i32
-}
-
-define i32 @test_unsafe-fp-math3(i32 %i) "unsafe-fp-math"="true" {
-  %1 = call i32 @unsafe-fp-math_callee1(i32 %i)
-  ret i32 %1
-; CHECK: @test_unsafe-fp-math3(i32 %i) [[UNSAFE_FPMATH_TRUE]] {
-; CHECK-NEXT: ret i32
-}
-
 ; Test that fn_ret_thunk_extern has no CompatRule; inlining is permitted.
 ; Test that fn_ret_thunk_extern has no MergeRule; fn_ret_thunk_extern is not
 ; propagated or dropped on the caller after inlining.
@@ -693,6 +653,4 @@ define i32 @loader_replaceable_caller() {
 ; CHECK: attributes [[NO_NANS_FPMATH_TRUE]] = { "no-nans-fp-math"="true" }
 ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_FALSE]] = { "no-signed-zeros-fp-math"="false" }
 ; CHECK: attributes [[NO_SIGNED_ZEROS_FPMATH_TRUE]] = { "no-signed-zeros-fp-math"="true" }
-; CHECK: attributes [[UNSAFE_FPMATH_FALSE]] = { "unsafe-fp-math"="false" }
-; CHECK: attributes [[UNSAFE_FPMATH_TRUE]] = { "unsafe-fp-math"="true" }
 ; CHECK: attributes [[FNRETTHUNK_EXTERN]] = { fn_ret_thunk_extern }
diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
deleted file mode 100644
index 7816781..0000000
--- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
+++ /dev/null
@@ -1,243 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=instcombine | FileCheck %s
-@A = extern_weak global float, align 4
-
-; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true
-; and false values, while %v1 and %phi.to.remove are actually the same.
-; Fold the selection instruction %same.as.v1 to %v1.
-define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT:    [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT:    [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:                                    ; preds = %entry, %for.body
-  %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
-  %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
-  %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
-  %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
-  %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
-  %q.load = load float, ptr %q
-  %c.load = load float, ptr %c
-  %sub = fsub float %q.load, %c.load
-  %cmp1 = fcmp olt float %sub, %v0
-  %v0.1 = select i1 %cmp1, float %sub, float %v0
-  %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
-  %cmp2 = fcmp ogt float  %sub, %same.as.v1
-  %v1.1 = select i1 %cmp2, float %sub, float %v1
-  %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
-  %inc.i = add nuw nsw i32 %i, 1
-  %q.next = getelementptr inbounds i8, ptr %q, i64 4
-  %c.next = getelementptr inbounds i8, ptr %c, i64 4
-  %exitcond = icmp eq i32 %inc.i, %count
-  br i1 %exitcond, label %exit, label %for.body
-
-exit:
-  %vl.1.lcssa = phi float [ %v1.1, %for.body ]
-  store float %vl.1.lcssa, ptr @A
-  ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT:    [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT:    [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:                                    ; preds = %entry, %for.body
-  %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
-  %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
-  %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
-  %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
-  %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
-  %q.load = load float, ptr %q
-  %c.load = load float, ptr %c
-  %sub = fsub float %q.load, %c.load
-  %cmp1 = fcmp olt float %sub, %v0
-  %v0.1 = select i1 %cmp1, float %sub, float %v0
-  %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
-  %cmp2 = fcmp ogt float  %sub, %same.as.v1
-  %v1.1 = select i1 %cmp2, float %v1, float %sub
-  %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
-  %inc.i = add nuw nsw i32 %i, 1
-  %q.next = getelementptr inbounds i8, ptr %q, i64 4
-  %c.next = getelementptr inbounds i8, ptr %c, i64 4
-  %exitcond = icmp eq i32 %inc.i, %count
-  br i1 %exitcond, label %exit, label %for.body
-
-exit:
-  %vl.1.lcssa = phi float [ %v1.1, %for.body ]
-  store float %vl.1.lcssa, ptr @A
-  ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; same.as.v1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT:    [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT:    [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:                                    ; preds = %entry, %for.body
-  %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
-  %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
-  %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
-  %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
-  %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
-  %q.load = load float, ptr %q
-  %c.load = load float, ptr %c
-  %sub = fsub float %q.load, %c.load
-  %cmp1 = fcmp olt float %sub, %v0
-  %v0.1 = select i1 %cmp1, float %sub, float %v0
-  %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
-  %cmp2 = fcmp ogt float  %sub, %same.as.v1
-  %v1.1 = select i1 %cmp2, float %sub, float %v1
-  %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
-  %inc.i = add nuw nsw i32 %i, 1
-  %q.next = getelementptr inbounds i8, ptr %q, i64 4
-  %c.next = getelementptr inbounds i8, ptr %c, i64 4
-  %exitcond = icmp eq i32 %inc.i, %count
-  br i1 %exitcond, label %exit, label %for.body
-
-exit:
-  %vl.1.lcssa = phi float [ %v1.1, %for.body ]
-  store float %vl.1.lcssa, ptr @A
-  ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT:    [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT:    [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT:    [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       exit:
-; CHECK-NEXT:    store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT:    ret void
-;
-entry:
-  br label %for.body
-
-for.body:                                    ; preds = %entry, %for.body
-  %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
-  %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
-  %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
-  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
-  %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
-  %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
-  %q.load = load float, ptr %q
-  %c.load = load float, ptr %c
-  %sub = fsub float %q.load, %c.load
-  %cmp1 = fcmp olt float %sub, %v0
-  %v0.1 = select i1 %cmp1, float %sub, float %v0
-  %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
-  %cmp2 = fcmp ogt float  %sub, %same.as.v1
-  %v1.1 = select i1 %cmp2, float %v1, float %sub
-  %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
-  %inc.i = add nuw nsw i32 %i, 1
-  %q.next = getelementptr inbounds i8, ptr %q, i64 4
-  %c.next = getelementptr inbounds i8, ptr %c, i64 4
-  %exitcond = icmp eq i32 %inc.i, %count
-  br i1 %exitcond, label %exit, label %for.body
-
-exit:
-  %vl.1.lcssa = phi float [ %v1.1, %for.body ]
-  store float %vl.1.lcssa, ptr @A
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 11cc971..fb7890a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -417,21 +417,17 @@ for.end:                                 ; preds = %for.body, %entry
 
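-; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
+; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies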
 ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1)
 define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @memory_dependence
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD1:.*]] = load <4 x i32>
 ; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]])
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index 829acbbf..305a692 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -210,3 +210,175 @@ loop:
 exit:
   ret void
 }
+
+define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) {
+; IC1-LABEL: define void @test_masked_interleave_group(
+; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; IC1-NEXT:  [[ENTRY:.*:]]
+; IC1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; IC1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; IC1-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; IC1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; IC1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IC1:       [[VECTOR_MEMCHECK]]:
+; IC1-NEXT:    [[TMP4:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; IC1-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; IC1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; IC1-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; IC1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; IC1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; IC1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; IC1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; IC1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; IC1-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; IC1-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; IC1-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; IC1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; IC1-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IC1:       [[VECTOR_PH]]:
+; IC1-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; IC1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; IC1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1-NEXT:    [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; IC1-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; IC1-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; IC1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; IC1:       [[VECTOR_BODY]]:
+; IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; IC1-NEXT:    [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; IC1-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; IC1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; IC1-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; IC1-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; IC1-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; IC1-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; IC1-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; IC1-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; IC1-NEXT:    [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; IC1-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; IC1-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT:    call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; IC1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IC1:       [[MIDDLE_BLOCK]]:
+; IC1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; IC1:       [[SCALAR_PH]]:
+;
+; CHECK-LABEL: define void @test_masked_interleave_group(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; CHECK-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop.header
+
+loop.header:                                      ; each iteration reads one mask byte that gates a copy of four floats
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ]
+  %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ]
+  %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ]
+  %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1
+  %mask.val = load i8, ptr %mask.iv, align 1
+  %should.copy = icmp eq i8 %mask.val, 0          ; the copy happens only when the mask byte is zero
+  br i1 %should.copy, label %then, label %loop.latch
+
+then:                                             ; conditional body: copies the floats at byte offsets 0, 4, 8, 12 from src to dst
+  %elem0 = load float, ptr %src.iv, align 4
+  store float %elem0, ptr %dst.iv, align 4
+  %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4
+  %s1 = load float, ptr %src.1.ptr, align 4
+  %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4
+  store float %s1, ptr %dst.1.ptr, align 4
+  %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8
+  %s2 = load float, ptr %src.2.ptr, align 4
+  %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8
+  store float %s2, ptr %dst.2.ptr, align 4
+  %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12
+  %s3 = load float, ptr %src.3.ptr, align 4
+  %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12
+  store float %s3, ptr %dst.3.ptr, align 4
+  br label %loop.latch
+
+loop.latch:                                       ; src and dst both advance by 16 bytes, i.e. one group of four floats
+  %iv.next = add i32 %iv, 1
+  %src.iv.next = getelementptr i8, ptr %src.iv, i64 16
+  %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16
+  %ec = icmp eq i32 %iv, %N                       ; compares the pre-increment %iv, so the loop body runs N + 1 times
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index c8d20dc..e42e2c7 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi"
 %struct.TwoInts = type { i32, i32 }
 %struct.ThreeInts = type { i32, i32, i32 }
 %struct.FourInts = type { i32, i32, i32, i32 }
+%struct.TwoShorts = type { i16, i16 }
 %struct.ThreeShorts = type { i16, i16, i16 }
 %struct.FourShorts = type { i16, i16, i16, i16 }
 %struct.TwoBytes = type { i8, i8 }
@@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi"
 %struct.FourBytes = type { i8, i8, i8, i8 }
 %struct.FiveBytes = type { i8, i8, i8, i8, i8 }
 %struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.TwoFloats = type { float, float }
+%struct.FourFloats = type { float, float, float, float }
 
 ; CHECK-LABEL: two_ints_same_op
 ; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
@@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
 34:                                               ; preds = %6, %4
   ret void
 }
+
+; CHECK-LABEL: two_floats_same_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): res[i] = a[i] * b[i] on both float fields of TwoFloats
+entry:
+  %cmp21.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; field 0 product
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022 ; &res[i]
+  store float %mul, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3 ; field 1 product, same opcode as field 0
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 ; field 1 of res[i]
+  store float %mul8, ptr %y10, align 4
+  %inc = add nuw i32 %i.022, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_vary_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): res[i].f0 = a[i].f0 + b[i].f0, res[i].f1 = a[i].f1 - b[i].f1 (TwoFloats)
+entry:
+  %cmp20.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1 ; field 0 uses fadd
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021 ; &res[i]
+  store float %add, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3 ; field 1 uses fsub, a different opcode from field 0
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4 ; field 1 of res[i]
+  store float %sub, ptr %y9, align 4
+  %inc = add nuw i32 %i.021, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoBytes inputs converted to float, multiplied per field, stored as TwoFloats
+entry:
+  %cmp24.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025 ; &a[i], which is also field 0
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025 ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3 ; field 0 product
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 ; &res[i], the float-typed destination
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 ; field 1 of a[i] (byte offset 1)
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 ; field 1 of b[i]
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %mul11 = fmul float %conv7, %conv10 ; field 1 product, same opcode as field 0
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; field 1 of res[i] (byte offset 4)
+  store float %mul11, ptr %y13, align 4
+  %inc = add nuw i32 %i.025, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoBytes inputs converted to float; field 0 added, field 1 subtracted; stored as TwoFloats
+entry:
+  %cmp23.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024 ; &a[i], which is also field 0
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024 ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %add = fadd float %conv, %conv3 ; field 0 uses fadd
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 ; &res[i], the float-typed destination
+  store float %add, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 ; field 1 of a[i] (byte offset 1)
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1 ; field 1 of b[i]
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %sub = fsub float %conv7, %conv10 ; field 1 uses fsub, a different opcode from field 0
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; field 1 of res[i] (byte offset 4)
+  store float %sub, ptr %y12, align 4
+  %inc = add nuw i32 %i.024, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoFloats inputs multiplied per field, converted to i8, stored as TwoBytes
+entry:
+  %cmp22.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; field 0 product
+  %conv = fptosi float %mul to i8 ; float -> signed i8 before the store
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023 ; &res[i], the byte-typed destination
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3 ; field 1 product, same opcode as field 0
+  %conv9 = fptosi float %mul8 to i8
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 ; field 1 of res[i] (byte offset 1)
+  store i8 %conv9, ptr %y11, align 1
+  %inc = add nuw i32 %i.023, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoFloats inputs; field 0 added, field 1 subtracted; converted to i8 and stored as TwoBytes
+entry:
+  %cmp21.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1 ; field 0 uses fadd
+  %conv = fptosi float %add to i8 ; float -> signed i8 before the store
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022 ; &res[i], the byte-typed destination
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3 ; field 1 uses fsub, a different opcode from field 0
+  %conv8 = fptosi float %sub to i8
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 ; field 1 of res[i] (byte offset 1)
+  store i8 %conv8, ptr %y10, align 1
+  %inc = add nuw i32 %i.022, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_same_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoShorts inputs converted to float, multiplied per field, stored as TwoFloats
+entry:
+  %cmp24.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025 ; &a[i], which is also field 0
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025 ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3 ; field 0 product
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025 ; &res[i], the float-typed destination
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; field 1 of a[i] (byte offset 2)
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 ; field 1 of b[i]
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %mul11 = fmul float %conv7, %conv10 ; field 1 product, same opcode as field 0
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; field 1 of res[i] (byte offset 4)
+  store float %mul11, ptr %y13, align 4
+  %inc = add nuw i32 %i.025, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_vary_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoShorts inputs converted to float; field 0 added, field 1 subtracted; stored as TwoFloats
+entry:
+  %cmp23.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024 ; &a[i], which is also field 0
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float ; signed i16 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024 ; &b[i]
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %add = fadd float %conv, %conv3 ; field 0 uses fadd
+  %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024 ; &res[i], the float-typed destination
+  store float %add, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; field 1 of a[i] (byte offset 2)
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2 ; field 1 of b[i]
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %sub = fsub float %conv7, %conv10 ; field 1 uses fsub, a different opcode from field 0
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; field 1 of res[i] (byte offset 4)
+  store float %sub, ptr %y12, align 4
+  %inc = add nuw i32 %i.024, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoFloats inputs multiplied per field, converted to i16, stored as TwoShorts
+entry:
+  %cmp22.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; field 0 product
+  %conv = fptosi float %mul to i16 ; float -> signed i16 before the store
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023 ; &res[i], the i16-typed destination
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3 ; field 1 product, same opcode as field 0
+  %conv9 = fptosi float %mul8 to i16
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 ; field 1 of res[i] (byte offset 2)
+  store i16 %conv9, ptr %y11, align 2
+  %inc = add nuw i32 %i.023, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): TwoFloats inputs; field 0 added, field 1 subtracted; converted to i16 and stored as TwoShorts
+entry:
+  %cmp21.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1 ; field 0 uses fadd
+  %conv = fptosi float %add to i16 ; float -> signed i16 before the store
+  %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022 ; &res[i], the i16-typed destination
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 of a[i] (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4 ; field 1 of b[i]
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3 ; field 1 uses fsub, a different opcode from field 0
+  %conv8 = fptosi float %sub to i16
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 ; field 1 of res[i] (byte offset 2)
+  store i16 %conv8, ptr %y10, align 2
+  %inc = add nuw i32 %i.022, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 1
+define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): res[i] = a[i] * b[i] on all four float fields of FourFloats
+entry:
+  %cmp45.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; field 0 product
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046 ; &res[i]
+  store float %mul, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3 ; field 1 product
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %mul8, ptr %y10, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; field 2 (byte offset 8)
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %mul14 = fmul float %4, %5 ; field 2 product
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+  store float %mul14, ptr %z16, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; field 3 (byte offset 12)
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %mul20 = fmul float %6, %7 ; field 3 product
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+  store float %mul20, ptr %w22, align 4
+  %inc = add nuw i32 %i.046, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 1
+define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): FourFloats fields 0..3 combined with fadd, fsub, fmul, fdiv respectively
+entry:
+  %cmp42.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp42.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1 ; field 0 uses fadd
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043 ; &res[i]
+  store float %add, ptr %arrayidx3, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; field 1 (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %sub = fsub float %2, %3 ; field 1 uses fsub
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store float %sub, ptr %y9, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; field 2 (byte offset 8)
+  %4 = load float, ptr %z, align 4
+  %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z12, align 4
+  %mul = fmul float %4, %5 ; field 2 uses fmul
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+  store float %mul, ptr %z14, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; field 3 (byte offset 12)
+  %6 = load float, ptr %w, align 4
+  %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w17, align 4
+  %div = fdiv float %6, %7 ; field 3 uses fdiv
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+  store float %div, ptr %w19, align 4
+  %inc = add nuw i32 %i.043, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 43
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): FourBytes inputs converted to float, multiplied per field, stored as FourFloats
+entry:
+  %cmp52.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053 ; &a[i], which is also field 0
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053 ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3 ; field 0 product
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053 ; &res[i], the float-typed destination
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 ; source field 1 (byte offset 1)
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %mul11 = fmul float %conv7, %conv10 ; field 1 product
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; destination field 1 (byte offset 4)
+  store float %mul11, ptr %y13, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; source field 2 (byte offset 2)
+  %4 = load i8, ptr %z, align 1
+  %conv15 = sitofp i8 %4 to float
+  %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %5 = load i8, ptr %z17, align 1
+  %conv18 = sitofp i8 %5 to float
+  %mul19 = fmul float %conv15, %conv18 ; field 2 product
+  %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 ; destination field 2 (byte offset 8)
+  store float %mul19, ptr %z21, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 ; source field 3 (byte offset 3)
+  %6 = load i8, ptr %w, align 1
+  %conv23 = sitofp i8 %6 to float
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+  %7 = load i8, ptr %w25, align 1
+  %conv26 = sitofp i8 %7 to float
+  %mul27 = fmul float %conv23, %conv26 ; field 3 product
+  %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 ; destination field 3 (byte offset 12)
+  store float %mul27, ptr %w29, align 4
+  %inc = add nuw i32 %i.053, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 43
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): FourBytes inputs converted to float; fields 0..3 combined with fmul, fadd, fdiv, fsub; stored as FourFloats
+entry:
+  %cmp49.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050 ; &a[i], which is also field 0
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sitofp i8 %0 to float ; signed i8 -> float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050 ; &b[i]
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv3 = sitofp i8 %1 to float
+  %mul = fmul float %conv, %conv3 ; field 0 uses fmul
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050 ; &res[i], the float-typed destination
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1 ; source field 1 (byte offset 1)
+  %2 = load i8, ptr %y, align 1
+  %conv7 = sitofp i8 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+  %3 = load i8, ptr %y9, align 1
+  %conv10 = sitofp i8 %3 to float
+  %add = fadd float %conv7, %conv10 ; field 1 uses fadd
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4 ; destination field 1 (byte offset 4)
+  store float %add, ptr %y12, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2 ; source field 2 (byte offset 2)
+  %4 = load i8, ptr %z, align 1
+  %conv14 = sitofp i8 %4 to float
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %5 = load i8, ptr %z16, align 1
+  %conv17 = sitofp i8 %5 to float
+  %div = fdiv float %conv14, %conv17 ; field 2 uses fdiv
+  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8 ; destination field 2 (byte offset 8)
+  store float %div, ptr %z19, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3 ; source field 3 (byte offset 3)
+  %6 = load i8, ptr %w, align 1
+  %conv21 = sitofp i8 %6 to float
+  %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+  %7 = load i8, ptr %w23, align 1
+  %conv24 = sitofp i8 %7 to float
+  %sub = fsub float %conv21, %conv24 ; field 3 uses fsub
+  %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12 ; destination field 3 (byte offset 12)
+  store float %sub, ptr %w26, align 4
+  %inc = add nuw i32 %i.050, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 38
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { ; for i in [0, N): FourFloats inputs multiplied per field, converted to i8, stored as FourBytes
+entry:
+  %cmp48.not = icmp eq i32 %N, 0 ; N == 0: skip the loop entirely
+  br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049 ; &a[i], which is also field 0
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049 ; &b[i]
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1 ; field 0 product
+  %conv = fptosi float %mul to i8 ; float -> signed i8 before the store
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049 ; &res[i], the byte-typed destination
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4 ; source field 1 (byte offset 4)
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3 ; field 1 product
+  %conv9 = fptosi float %mul8 to i8
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1 ; destination field 1 (byte offset 1)
+  store i8 %conv9, ptr %y11, align 1
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8 ; source field 2 (byte offset 8)
+  %4 = load float, ptr %z, align 4
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z14, align 4
+  %mul15 = fmul float %4, %5 ; field 2 product
+  %conv16 = fptosi float %mul15 to i8
+  %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2 ; destination field 2 (byte offset 2)
+  store i8 %conv16, ptr %z18, align 1
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12 ; source field 3 (byte offset 12)
+  %6 = load float, ptr %w, align 4
+  %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w21, align 4
+  %mul22 = fmul float %6, %7 ; field 3 product
+  %conv23 = fptosi float %mul22 to i8
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3 ; destination field 3 (byte offset 3)
+  store i8 %conv23, ptr %w25, align 1
+  %inc = add nuw i32 %i.049, 1
+  %exitcond.not = icmp eq i32 %inc, %N ; leave once the incremented index reaches N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 38
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp45.not = icmp eq i32 %N, 0
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i8
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046
+  store i8 %conv, ptr %arrayidx3, align 1
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %add = fadd float %2, %3
+  %conv8 = fptosi float %add to i8
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+  store i8 %conv8, ptr %y10, align 1
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %div = fdiv float %4, %5
+  %conv14 = fptosi float %div to i8
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i8 %conv14, ptr %z16, align 1
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %sub = fsub float %6, %7
+  %conv20 = fptosi float %sub to i8
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+  store i8 %conv20, ptr %w22, align 1
+  %inc = add nuw i32 %i.046, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 37
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp52.not = icmp eq i32 %N, 0
+  br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %mul11 = fmul float %conv7, %conv10
+  %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %mul11, ptr %y13, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %4 = load i16, ptr %z, align 2
+  %conv15 = sitofp i16 %4 to float
+  %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %5 = load i16, ptr %z17, align 2
+  %conv18 = sitofp i16 %5 to float
+  %mul19 = fmul float %conv15, %conv18
+  %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+  store float %mul19, ptr %z21, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+  %6 = load i16, ptr %w, align 2
+  %conv23 = sitofp i16 %6 to float
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+  %7 = load i16, ptr %w25, align 2
+  %conv26 = sitofp i16 %7 to float
+  %mul27 = fmul float %conv23, %conv26
+  %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+  store float %mul27, ptr %w29, align 4
+  %inc = add nuw i32 %i.053, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 37
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp49.not = icmp eq i32 %N, 0
+  br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sitofp i16 %0 to float
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050
+  %1 = load i16, ptr %arrayidx1, align 2
+  %conv3 = sitofp i16 %1 to float
+  %mul = fmul float %conv, %conv3
+  %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050
+  store float %mul, ptr %arrayidx4, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+  %2 = load i16, ptr %y, align 2
+  %conv7 = sitofp i16 %2 to float
+  %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+  %3 = load i16, ptr %y9, align 2
+  %conv10 = sitofp i16 %3 to float
+  %add = fadd float %conv7, %conv10
+  %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+  store float %add, ptr %y12, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %4 = load i16, ptr %z, align 2
+  %conv14 = sitofp i16 %4 to float
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %5 = load i16, ptr %z16, align 2
+  %conv17 = sitofp i16 %5 to float
+  %div = fdiv float %conv14, %conv17
+  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+  store float %div, ptr %z19, align 4
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+  %6 = load i16, ptr %w, align 2
+  %conv21 = sitofp i16 %6 to float
+  %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+  %7 = load i16, ptr %w23, align 2
+  %conv24 = sitofp i16 %7 to float
+  %sub = fsub float %conv21, %conv24
+  %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+  store float %sub, ptr %w26, align 4
+  %inc = add nuw i32 %i.050, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 35
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp48.not = icmp eq i32 %N, 0
+  br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i16
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %mul8 = fmul float %2, %3
+  %conv9 = fptosi float %mul8 to i16
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i16 %conv9, ptr %y11, align 2
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+  %4 = load float, ptr %z, align 4
+  %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z14, align 4
+  %mul15 = fmul float %4, %5
+  %conv16 = fptosi float %mul15 to i16
+  %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store i16 %conv16, ptr %z18, align 2
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+  %6 = load float, ptr %w, align 4
+  %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w21, align 4
+  %mul22 = fmul float %6, %7
+  %conv23 = fptosi float %mul22 to i16
+  %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+  store i16 %conv23, ptr %w25, align 2
+  %inc = add nuw i32 %i.049, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 35
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+  %cmp45.not = icmp eq i32 %N, 0
+  br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+  %1 = load float, ptr %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %conv = fptosi float %mul to i16
+  %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046
+  store i16 %conv, ptr %arrayidx3, align 2
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+  %2 = load float, ptr %y, align 4
+  %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+  %3 = load float, ptr %y7, align 4
+  %add = fadd float %2, %3
+  %conv8 = fptosi float %add to i16
+  %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+  store i16 %conv8, ptr %y10, align 2
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+  %4 = load float, ptr %z, align 4
+  %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+  %5 = load float, ptr %z13, align 4
+  %div = fdiv float %4, %5
+  %conv14 = fptosi float %div to i16
+  %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+  store i16 %conv14, ptr %z16, align 2
+  %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+  %6 = load float, ptr %w, align 4
+  %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+  %7 = load float, ptr %w19, align 4
+  %sub = fsub float %6, %7
+  %conv20 = fptosi float %sub to i16
+  %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+  store i16 %conv20, ptr %w22, align 2
+  %inc = add nuw i32 %i.046, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
new file mode 100644
index 0000000..01934b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure an unsafe user-specified interleave count is ignored.
+
+; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-LABEL: @loop_distance_4
+define void @loop_distance_4(ptr %a, ptr %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ]
+  %0 = getelementptr i32, ptr %b, i64 %iv
+  %arrayidx = getelementptr i8, ptr %0, i64 -16
+  %1 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %0, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 64
+  br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = !{!1, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll
index a3cf23b..f793814 100644
--- a/llvm/test/Transforms/SCCP/conditions-ranges.ll
+++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll
@@ -1547,3 +1547,28 @@ bb2:
   call void @use(i1 %c4)
   ret void
 }
+
+define i1 @and_predicate_dominating_phi(i32 %x) {
+; CHECK-LABEL: @and_predicate_dominating_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK-NEXT:    br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK:       nope:
+; CHECK-NEXT:    br label [[PHI]]
+; CHECK:       phi:
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %xge1 = icmp uge i32 %x, 1
+  %xlt2 = icmp ult i32 %x, 2
+  %and = and i1 %xge1, %xlt2
+  br i1 %and, label %phi, label %nope
+nope:
+  br label %phi
+phi:
+  %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+  %ret = icmp uge i32 %res, 1
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
index 0f18dc2..46e38d9 100644
--- a/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
+++ b/llvm/test/Transforms/SimplifyCFG/AArch64/prefer-fma.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S >%t
+; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S >%t
 ; RUN: FileCheck %s < %t
 ; ModuleID = 't.cc'
 
diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
index c7bc43e1..b61d659 100644
--- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
+++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -enable-unsafe-fp-math -S | \
+; RUN: opt < %s -mtriple=powerpc64le-unknown-linux-gnu -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -hoist-common-insts=true -S | \
 ; RUN: FileCheck %s
 
 ; This case is copied from test/Transforms/SimplifyCFG/AArch64/
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 2e96a92..cc1dc4e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) {
   call void @foo(i1 %a15)
   ret void
 }
+
+define i32 @test_and_with_phinode(i32 %x) {
+; CHECK-LABEL: @test_and_with_phinode(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK:         [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK:         [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT:    br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK:       nope:
+; CHECK-NEXT:    br label [[PHI]]
+; CHECK:       phi:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %xge1 = icmp uge i32 %x, 1
+  %xlt2 = icmp ult i32 %x, 2
+  %and = and i1 %xge1, %xlt2
+  br i1 %and, label %phi, label %nope
+nope:
+  br label %phi
+phi:
+  %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll
new file mode 100644
index 0000000..10566ae
--- /dev/null
+++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll
@@ -0,0 +1,132 @@
+; -stats requires asserts
+; REQUIRES: asserts
+
+; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled.
+; Also check that devirtualization is skipped for empty functions in speculative devirtualization mode.
+
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \
+; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:13:0: devirtualized vf
+; CHECK-NOT: devirtualized
+
+@vt1 = constant [1 x ptr] [ptr @vf], !type !8
+@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12
+
+define i1 @vf(ptr %this) #0 !dbg !7 {
+  ret i1 true
+}
+
+; This should NOT be devirtualized: in non-LTO (speculative) mode, empty
+; functions are skipped.
+define void @vf_empty(ptr %this) !dbg !11 {
+  ret void
+}
+
+; CHECK: define void @call
+define void @call(ptr %obj) #1 !dbg !5 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptr = load ptr, ptr %vtable
+  ; CHECK: if.true.direct_targ:
+  ; CHECK:   call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK:   call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !6
+  ret void
+}
+
+
+; CHECK: define void @call1
+define void @call1(ptr %obj) #1 !dbg !9 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptr = load ptr, ptr %vtable, align 8
+  ; CHECK: call i1 %fptr
+  %1 = call i1 %fptr(ptr %obj), !dbg !10
+  ret void
+}
+declare ptr @llvm.load.relative.i32(ptr, i32)
+
+@vt3 = private unnamed_addr constant [1 x i32] [
+  i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32)
+], align 4, !type !15
+
+; CHECK: define void @call2
+define void @call2(ptr %obj) #1 !dbg !13 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2")
+  call void @llvm.assume(i1 %p)
+  %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0)
+  ; CHECK: if.true.direct_targ:
+  ; CHECK:   call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK:   call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !14
+  ret void
+}
+
+@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [
+  i32 0,  ; offset to top
+  i32 0,  ; rtti
+  i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)  ; vfunc offset
+] }, align 4, !type !18
+
+; CHECK: define void @call3
+define void @call3(ptr %obj) #1 !dbg !16 {
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3")
+  call void @llvm.assume(i1 %p)
+  %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8)
+  ; CHECK: if.true.direct_targ:
+  ; CHECK:   call i1 @vf(
+  ; CHECK: if.false.orig_indirect:
+  ; CHECK:   call i1 %fptr(
+  call i1 %fptr(ptr %obj), !dbg !17
+  ret void
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare i1 @llvm.public.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "devirt-single.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{!"clang version 4.0.0 (trunk 278098)"}
+!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 30, column: 32, scope: !5)
+!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !{i32 0, !"typeid"}
+
+!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!10 = !DILocation(line: 35, column: 32, scope: !9)
+!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!12 = !{i32 0, !"typeid1"}
+
+!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!14 = !DILocation(line: 41, column: 32, scope: !13)
+!15 = !{i32 0, !"typeid2"}
+
+!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!17 = !DILocation(line: 51, column: 32, scope: !16)
+!18 = !{i32 0, !"typeid3"}
+
+
+
+; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets
+; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations
diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
index d8f5c91..8327e1c 100644
--- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
+++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
@@ -11,6 +11,9 @@
 ; Check wildcard
 ; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP
 
+; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled.
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD
+
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32)
 ; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations
 ; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations
 ; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations
+
+; CHECK-SPECULATIVE-WPD-NOT: wholeprogramdevirt - Number of unique return value optimizations
+; CHECK-SPECULATIVE-WPD-NOT: wholeprogramdevirt - Number of virtual constant propagations
+; CHECK-SPECULATIVE-WPD-NOT: wholeprogramdevirt - Number of 1 bit virtual constant propagations
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
index 1ffe533..d1df304 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
@@ -1403,8 +1403,8 @@ vzeroupper
 # CHECK-NEXT:  1      8     0.50    *                   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpblendw	$11, %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vpblendw	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  4      4     2.00                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  4      4     1.50                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      11    1.50    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpcmpeqb	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vpcmpeqb	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpcmpeqd	%xmm0, %xmm1, %xmm2
@@ -1415,8 +1415,8 @@ vzeroupper
 # CHECK-NEXT:  1      8     0.50    *                   vpcmpeqw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  8      6     3.00                        vpcmpestri	$1, %xmm0, %xmm2
 # CHECK-NEXT:  12     13    3.00    *                   vpcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  7      6     3.00                        vpcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  12     13    3.00    *                   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      7     3.00                        vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  12     14    3.00    *                   vpcmpestrm	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpcmpgtb	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vpcmpgtb	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpcmpgtd	%xmm0, %xmm1, %xmm2
@@ -1427,8 +1427,8 @@ vzeroupper
 # CHECK-NEXT:  1      8     0.50    *                   vpcmpgtw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  4      2     2.00                        vpcmpistri	$1, %xmm0, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   vpcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      6     2.00                        vpcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      13    2.00    *                   vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      7     2.00                        vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      14    2.00    *                   vpcmpistrm	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vperm2f128	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      10    1.00    *                   vperm2f128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpermilpd	$1, %xmm0, %xmm2
@@ -1749,7 +1749,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 1.33   1.33   1.33   16.50  16.50  16.50  16.50   -     205.25 393.58 268.08 158.08 208.50 208.50 65.00  119.67 119.67 119.67 107.00 107.00 107.00 19.00  19.00
+# CHECK-NEXT: 1.33   1.33   1.33   16.50  16.50  16.50  16.50   -     204.25 392.58 268.08 158.08 208.50 208.50 65.00  119.67 119.67 119.67 107.00 107.00 107.00 19.00  19.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2126,8 +2126,8 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpblendw	$11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpcmpeqb	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
index 6dc5bac..6c8fac4 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
@@ -560,14 +560,14 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      8     0.50    *                   vpcmpgtw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      3     1.00                        vperm2i128	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      8     1.00    *                   vperm2i128	$1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      5     1.00                        vpermd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      12    2.00    *                   vpermd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      6     1.00                        vpermpd	$1, %ymm0, %ymm2
-# CHECK-NEXT:  3      13    2.00    *                   vpermpd	$1, (%rax), %ymm2
-# CHECK-NEXT:  2      7     1.00                        vpermps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      14    2.00    *                   vpermps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  2      6     1.00                        vpermq	$1, %ymm0, %ymm2
-# CHECK-NEXT:  2      12    2.00    *                   vpermq	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vpermd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      11    1.00    *                   vpermd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vpermpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  1      11    1.00    *                   vpermpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vpermps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      11    1.00    *                   vpermps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vpermq	$1, %ymm0, %ymm2
+# CHECK-NEXT:  1      11    1.00    *                   vpermq	$1, (%rax), %ymm2
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherdd	%xmm0, (%rax,%xmm1,2), %xmm2
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherdd	%ymm0, (%rax,%ymm1,2), %ymm2
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherdq	%xmm0, (%rax,%xmm1,2), %xmm2
@@ -789,7 +789,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 6.67   6.67   6.67    -      -      -      -      -     93.75  132.75 92.25  36.25  80.50  80.50  29.00  52.33  52.33  52.33  50.67  50.67  50.67  2.50   2.50
+# CHECK-NEXT: 6.67   6.67   6.67    -      -      -      -      -     93.75  128.75 92.25  36.25  80.50  80.50  29.00  52.33  52.33  52.33  50.67  50.67  50.67  2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -894,13 +894,13 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vperm2i128	$1, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vperm2i128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpermd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     2.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermd	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpermpd	$1, %ymm0, %ymm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     2.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermpd	$1, (%rax), %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermpd	$1, (%rax), %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpermps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     2.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpermq	$1, %ymm0, %ymm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     2.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermq	$1, (%rax), %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00    -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpermq	$1, (%rax), %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherdd	%xmm0, (%rax,%xmm1,2), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherdd	%ymm0, (%rax,%ymm1,2), %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherdq	%xmm0, (%rax,%xmm1,2), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
index 72d7de3..14b8e5f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
@@ -1207,7 +1207,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      3     1.00                        vaddps	%zmm16, %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      10    1.00    *                   vaddps	(%rax), %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      10    1.00    *                   vaddps	(%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignd	$1, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  1      2     0.50                        valignd	$1, %zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  1      8     1.00    *                   valignd	$1, (%rax), %zmm17, %zmm19
 # CHECK-NEXT:  1      8     1.00    *                   valignd	$1, (%rax){1to16}, %zmm17, %zmm19
 # CHECK-NEXT:  1      1     1.00                        valignd	$1, %zmm16, %zmm17, %zmm19 {%k1}
@@ -1216,7 +1216,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      1     1.00                        valignd	$1, %zmm16, %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     1.00    *                   valignd	$1, (%rax), %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     1.00    *                   valignd	$1, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignq	$1, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  1      2     0.50                        valignq	$1, %zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  1      8     1.00    *                   valignq	$1, (%rax), %zmm17, %zmm19
 # CHECK-NEXT:  1      8     1.00    *                   valignq	$1, (%rax){1to8}, %zmm17, %zmm19
 # CHECK-NEXT:  1      1     1.00                        valignq	$1, %zmm16, %zmm17, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
index 552b3e4..ead609e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
@@ -1948,7 +1948,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  1      3     0.50                        vaddps	%ymm16, %ymm17, %ymm19 {%k1} {z}
 # CHECK-NEXT:  1      10    0.50    *                   vaddps	(%rax), %ymm17, %ymm19 {%k1} {z}
 # CHECK-NEXT:  1      10    0.50    *                   vaddps	(%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignd	$1, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT:  1      3     0.50                        valignd	$1, %xmm16, %xmm17, %xmm19
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax), %xmm17, %xmm19
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax){1to4}, %xmm17, %xmm19
 # CHECK-NEXT:  1      1     0.50                        valignd	$1, %xmm16, %xmm17, %xmm19 {%k1}
@@ -1957,7 +1957,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  1      1     0.50                        valignd	$1, %xmm16, %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax), %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignd	$1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  1      4     1.00                        valignd	$1, %ymm16, %ymm17, %ymm19
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax), %ymm17, %ymm19
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax){1to8}, %ymm17, %ymm19
 # CHECK-NEXT:  1      1     0.50                        valignd	$1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -1966,7 +1966,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  1      1     0.50                        valignd	$1, %ymm16, %ymm17, %ymm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax), %ymm17, %ymm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignd	$1, (%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignq	$1, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT:  1      3     0.50                        valignq	$1, %xmm16, %xmm17, %xmm19
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax), %xmm17, %xmm19
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax){1to2}, %xmm17, %xmm19
 # CHECK-NEXT:  1      1     0.50                        valignq	$1, %xmm16, %xmm17, %xmm19 {%k1}
@@ -1975,7 +1975,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  1      1     0.50                        valignq	$1, %xmm16, %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax), %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     0.50                        valignq	$1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  1      4     1.00                        valignq	$1, %ymm16, %ymm17, %ymm19
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax), %ymm17, %ymm19
 # CHECK-NEXT:  1      8     0.50    *                   valignq	$1, (%rax){1to4}, %ymm17, %ymm19
 # CHECK-NEXT:  1      1     0.50                        valignq	$1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -3614,7 +3614,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 10.67  10.67  10.67   -      -      -      -      -     208.00 1083.00 636.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00  32.00
+# CHECK-NEXT: 10.67  10.67  10.67   -      -      -      -      -     208.00 1084.00 637.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00  32.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -3663,7 +3663,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignd	$1, %xmm16, %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignd	$1, (%rax), %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignd	$1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignd	$1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     valignd	$1, %ymm16, %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignd	$1, (%rax), %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignd	$1, (%rax){1to8}, %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignd	$1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -3681,7 +3681,7 @@ vunpcklps         (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignq	$1, %xmm16, %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignq	$1, (%rax), %xmm17, %xmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignq	$1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignq	$1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     valignq	$1, %ymm16, %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignq	$1, (%rax), %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     valignq	$1, (%rax){1to4}, %ymm17, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     valignq	$1, %ymm16, %ymm17, %ymm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
index 87ba060..d1f2a98 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
@@ -13,8 +13,8 @@ vpclmulqdq    $11, (%rax), %zmm17, %zmm19
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  4      4     2.00                        vpclmulqdq	$11, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT:  4      11    2.00    *                   vpclmulqdq	$11, (%rax), %zmm17, %zmm19
+# CHECK-NEXT:  4      4     1.50                        vpclmulqdq	$11, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  4      11    1.50    *                   vpclmulqdq	$11, (%rax), %zmm17, %zmm19
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -43,9 +43,9 @@ vpclmulqdq    $11, (%rax), %zmm17, %zmm19
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.00   4.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %zmm17, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %zmm17, %zmm19
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
index 3c80c56..ea7a280 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
@@ -16,10 +16,10 @@ vpclmulqdq    $11, (%rax), %ymm17, %ymm19
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  4      4     2.00                        vpclmulqdq	$11, %xmm16, %xmm17, %xmm19
-# CHECK-NEXT:  4      11    2.00    *                   vpclmulqdq	$11, (%rax), %xmm17, %xmm19
-# CHECK-NEXT:  4      4     2.00                        vpclmulqdq	$11, %ymm16, %ymm17, %ymm19
-# CHECK-NEXT:  4      11    2.00    *                   vpclmulqdq	$11, (%rax), %ymm17, %ymm19
+# CHECK-NEXT:  4      4     1.50                        vpclmulqdq	$11, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT:  4      11    1.50    *                   vpclmulqdq	$11, (%rax), %xmm17, %xmm19
+# CHECK-NEXT:  4      4     1.50                        vpclmulqdq	$11, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  4      11    1.50    *                   vpclmulqdq	$11, (%rax), %ymm17, %ymm19
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -48,11 +48,11 @@ vpclmulqdq    $11, (%rax), %ymm17, %ymm19
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     8.00   8.00    -      -     1.00   1.00    -     0.67   0.67   0.67   0.67   0.67   0.67    -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     6.00   6.00    -      -     1.00   1.00    -     0.67   0.67   0.67   0.67   0.67   0.67    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %xmm16, %xmm17, %xmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %xmm17, %xmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %ymm16, %ymm17, %ymm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %ymm17, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %xmm17, %xmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %ymm17, %ymm19
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
index f4888cf..afbd566 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
@@ -69,12 +69,12 @@ tzcnt       (%rax), %rcx
 # CHECK-NEXT:  2      5     0.33    *                   blsrl	(%rax), %ecx
 # CHECK-NEXT:  1      1     0.25                        blsrq	%rax, %rcx
 # CHECK-NEXT:  2      5     0.33    *                   blsrq	(%rax), %rcx
-# CHECK-NEXT:  2      2     1.00                        tzcntw	%ax, %cx
-# CHECK-NEXT:  2      6     0.50    *                   tzcntw	(%rax), %cx
-# CHECK-NEXT:  2      2     0.50                        tzcntl	%eax, %ecx
-# CHECK-NEXT:  2      6     0.50    *                   tzcntl	(%rax), %ecx
-# CHECK-NEXT:  2      2     0.50                        tzcntq	%rax, %rcx
-# CHECK-NEXT:  2      6     0.50    *                   tzcntq	(%rax), %rcx
+# CHECK-NEXT:  1      1     0.25                        tzcntw	%ax, %cx
+# CHECK-NEXT:  1      5     0.50    *                   tzcntw	(%rax), %cx
+# CHECK-NEXT:  1      1     0.50                        tzcntl	%eax, %ecx
+# CHECK-NEXT:  1      5     0.50    *                   tzcntl	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50                        tzcntq	%rax, %rcx
+# CHECK-NEXT:  1      5     0.50    *                   tzcntq	(%rax), %rcx
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -103,7 +103,7 @@ tzcnt       (%rax), %rcx
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 4.33   4.33   4.33   5.00   9.50   9.50   5.00    -      -      -      -      -      -      -      -     4.33   4.33   4.33   4.33   4.33   4.33    -      -
+# CHECK-NEXT: 4.33   4.33   4.33   4.25   8.75   8.75   4.25    -      -      -      -      -      -      -      -     4.33   4.33   4.33   4.33   4.33   4.33    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -127,7 +127,7 @@ tzcnt       (%rax), %rcx
 # CHECK-NEXT: 0.33   0.33   0.33   0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     blsrl	(%rax), %ecx
 # CHECK-NEXT:  -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	%rax, %rcx
 # CHECK-NEXT: 0.33   0.33   0.33   0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     blsrq	(%rax), %rcx
-# CHECK-NEXT:  -      -      -     1.00   1.00   1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntw	%ax, %cx
+# CHECK-NEXT:  -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntw	%ax, %cx
 # CHECK-NEXT: 0.33   0.33   0.33    -     0.50   0.50    -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     tzcntw	(%rax), %cx
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	%eax, %ecx
 # CHECK-NEXT: 0.33   0.33   0.33    -     0.50   0.50    -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     tzcntl	(%rax), %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
index 64feeaf..26a42fd 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
@@ -15,10 +15,10 @@ lock cmpxchg16b (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  19     3     6.00    *      *            cmpxchg8b	(%rax)
-# CHECK-NEXT:  28     4     14.75   *      *            cmpxchg16b	(%rax)
-# CHECK-NEXT:  19     3     6.00    *      *            lock		cmpxchg8b	(%rax)
-# CHECK-NEXT:  28     4     14.75   *      *            lock		cmpxchg16b	(%rax)
+# CHECK-NEXT:  15     3     5.00    *      *            cmpxchg8b	(%rax)
+# CHECK-NEXT:  26     2     10.00   *      *            cmpxchg16b	(%rax)
+# CHECK-NEXT:  15     3     5.00    *      *            lock		cmpxchg8b	(%rax)
+# CHECK-NEXT:  26     2     10.00   *      *            lock		cmpxchg16b	(%rax)
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -47,11 +47,11 @@ lock cmpxchg16b (%rax)
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -     41.50  41.50  41.50  41.50   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -     30.00  30.00  30.00  30.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -     6.00   6.00   6.00   6.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
-# CHECK-NEXT:  -      -      -     14.75  14.75  14.75  14.75   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
-# CHECK-NEXT:  -      -      -     6.00   6.00   6.00   6.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lock		cmpxchg8b	(%rax)
-# CHECK-NEXT:  -      -      -     14.75  14.75  14.75  14.75   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lock		cmpxchg16b	(%rax)
+# CHECK-NEXT:  -      -      -     5.00   5.00   5.00   5.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -      -     10.00  10.00  10.00  10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
+# CHECK-NEXT:  -      -      -     5.00   5.00   5.00   5.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lock		cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -      -     10.00  10.00  10.00  10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lock		cmpxchg16b	(%rax)
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
index a36fb2aa..fc2bc8e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
@@ -13,8 +13,8 @@ pclmulqdq     $11, (%rax), %xmm2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  4      4     2.00                        pclmulqdq	$11, %xmm0, %xmm2
-# CHECK-NEXT:  4      11    2.00    *                   pclmulqdq	$11, (%rax), %xmm2
+# CHECK-NEXT:  4      4     1.50                        pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  4      11    1.50    *                   pclmulqdq	$11, (%rax), %xmm2
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -43,9 +43,9 @@ pclmulqdq     $11, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.00   4.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     pclmulqdq	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     pclmulqdq	$11, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
index 015d37e..ae60835 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
@@ -52,12 +52,12 @@ pcmpgtq     (%rax), %xmm2
 # CHECK-NEXT:  1      7     1.00    *                   crc32q	(%rax), %rcx
 # CHECK-NEXT:  8      6     3.00                        pcmpestri	$1, %xmm0, %xmm2
 # CHECK-NEXT:  12     13    3.00    *                   pcmpestri	$1, (%rax), %xmm2
-# CHECK-NEXT:  7      6     3.00                        pcmpestrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  12     13    3.00    *                   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      7     3.00                        pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  12     14    3.00    *                   pcmpestrm	$1, (%rax), %xmm2
 # CHECK-NEXT:  4      2     2.00                        pcmpistri	$1, %xmm0, %xmm2
 # CHECK-NEXT:  4      9     2.00    *                   pcmpistri	$1, (%rax), %xmm2
-# CHECK-NEXT:  3      6     2.00                        pcmpistrm	$1, %xmm0, %xmm2
-# CHECK-NEXT:  4      13    2.00    *                   pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      7     2.00                        pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  4      14    2.00    *                   pcmpistrm	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        pcmpgtq	%xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   pcmpgtq	(%rax), %xmm2
 
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
index 55a36d0..dca4703 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
@@ -13,8 +13,8 @@ vpclmulqdq    $11, (%rax), %ymm1, %ymm3
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  4      4     2.00                        vpclmulqdq	$11, %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  4      11    2.00    *                   vpclmulqdq	$11, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  4      4     1.50                        vpclmulqdq	$11, %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  4      11    1.50    *                   vpclmulqdq	$11, (%rax), %ymm1, %ymm3
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -43,9 +43,9 @@ vpclmulqdq    $11, (%rax), %ymm1, %ymm3
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.00   4.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %ymm0, %ymm1, %ymm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vpclmulqdq	$11, %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpclmulqdq	$11, (%rax), %ymm1, %ymm3
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
index 9c5b4e4..886d9c6 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
@@ -1173,18 +1173,18 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      6     0.67    *      *            andq	%rsi, (%rax)
 # CHECK-NEXT:  1      6     0.67    *      *            lock		andq	%rsi, (%rax)
 # CHECK-NEXT:  1      5     0.33    *                   andq	(%rax), %rdi
-# CHECK-NEXT:  6      1     1.00                        bsfw	%si, %di
-# CHECK-NEXT:  6      1     1.00                        bsrw	%si, %di
-# CHECK-NEXT:  7      5     1.00    *                   bsfw	(%rax), %di
-# CHECK-NEXT:  7      5     1.00    *                   bsrw	(%rax), %di
-# CHECK-NEXT:  6      1     1.00                        bsfl	%esi, %edi
-# CHECK-NEXT:  6      1     1.00                        bsrl	%esi, %edi
-# CHECK-NEXT:  7      5     1.00    *                   bsfl	(%rax), %edi
-# CHECK-NEXT:  7      5     1.00    *                   bsrl	(%rax), %edi
-# CHECK-NEXT:  6      1     1.00                        bsfq	%rsi, %rdi
-# CHECK-NEXT:  6      1     1.00                        bsrq	%rsi, %rdi
-# CHECK-NEXT:  7      5     1.00    *                   bsfq	(%rax), %rdi
-# CHECK-NEXT:  7      5     1.00    *                   bsrq	(%rax), %rdi
+# CHECK-NEXT:  1      1     1.00                        bsfw	%si, %di
+# CHECK-NEXT:  1      1     1.00                        bsrw	%si, %di
+# CHECK-NEXT:  2      5     1.00    *                   bsfw	(%rax), %di
+# CHECK-NEXT:  2      5     1.00    *                   bsrw	(%rax), %di
+# CHECK-NEXT:  1      1     1.00                        bsfl	%esi, %edi
+# CHECK-NEXT:  1      1     1.00                        bsrl	%esi, %edi
+# CHECK-NEXT:  2      5     1.00    *                   bsfl	(%rax), %edi
+# CHECK-NEXT:  2      5     1.00    *                   bsrl	(%rax), %edi
+# CHECK-NEXT:  1      1     1.00                        bsfq	%rsi, %rdi
+# CHECK-NEXT:  1      1     1.00                        bsrq	%rsi, %rdi
+# CHECK-NEXT:  2      5     1.00    *                   bsfq	(%rax), %rdi
+# CHECK-NEXT:  2      5     1.00    *                   bsrq	(%rax), %rdi
 # CHECK-NEXT:  1      1     0.25                        bswapl	%eax
 # CHECK-NEXT:  1      1     0.25                        bswapq	%rax
 # CHECK-NEXT:  1      1     0.50                        btw	%si, %di
@@ -1321,23 +1321,23 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      1     0.25                        decq	%rdi
 # CHECK-NEXT:  1      6     0.67    *      *            decq	(%rax)
 # CHECK-NEXT:  1      6     0.67    *      *            lock		decq	(%rax)
-# CHECK-NEXT:  2      10    10.00                 U     divb	%dil
-# CHECK-NEXT:  2      14    10.00   *             U     divb	(%rax)
-# CHECK-NEXT:  2      11    11.00                 U     divw	%si
-# CHECK-NEXT:  2      15    11.00   *             U     divw	(%rax)
-# CHECK-NEXT:  2      13    13.00                 U     divl	%edx
-# CHECK-NEXT:  2      17    13.00   *             U     divl	(%rax)
-# CHECK-NEXT:  2      17    17.00                 U     divq	%rcx
-# CHECK-NEXT:  2      21    17.00   *             U     divq	(%rax)
+# CHECK-NEXT:  2      9     9.00                  U     divb	%dil
+# CHECK-NEXT:  2      13    9.00    *             U     divb	(%rax)
+# CHECK-NEXT:  2      10    10.00                 U     divw	%si
+# CHECK-NEXT:  2      14    10.00   *             U     divw	(%rax)
+# CHECK-NEXT:  2      12    12.00                 U     divl	%edx
+# CHECK-NEXT:  2      16    12.00   *             U     divl	(%rax)
+# CHECK-NEXT:  2      18    18.00                 U     divq	%rcx
+# CHECK-NEXT:  2      22    18.00   *             U     divq	(%rax)
 # CHECK-NEXT:  100    100   25.00                 U     enter	$7, $4095
-# CHECK-NEXT:  2      10    10.00                 U     idivb	%dil
-# CHECK-NEXT:  2      14    10.00   *             U     idivb	(%rax)
-# CHECK-NEXT:  2      11    11.00                 U     idivw	%si
-# CHECK-NEXT:  2      15    11.00   *             U     idivw	(%rax)
-# CHECK-NEXT:  2      13    13.00                 U     idivl	%edx
-# CHECK-NEXT:  2      17    13.00   *             U     idivl	(%rax)
-# CHECK-NEXT:  2      17    17.00                 U     idivq	%rcx
-# CHECK-NEXT:  2      21    17.00   *             U     idivq	(%rax)
+# CHECK-NEXT:  2      9     9.00                  U     idivb	%dil
+# CHECK-NEXT:  2      13    9.00    *             U     idivb	(%rax)
+# CHECK-NEXT:  2      10    10.00                 U     idivw	%si
+# CHECK-NEXT:  2      14    10.00   *             U     idivw	(%rax)
+# CHECK-NEXT:  2      12    12.00                 U     idivl	%edx
+# CHECK-NEXT:  2      16    12.00   *             U     idivl	(%rax)
+# CHECK-NEXT:  2      18    18.00                 U     idivq	%rcx
+# CHECK-NEXT:  2      22    18.00   *             U     idivq	(%rax)
 # CHECK-NEXT:  1      3     3.00                        imulb	%dil
 # CHECK-NEXT:  1      7     3.00    *                   imulb	(%rax)
 # CHECK-NEXT:  3      3     3.00                        imulw	%di
@@ -1891,12 +1891,12 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      5     0.67    *      *            xaddq	%rax, (%rbx)
 # CHECK-NEXT:  1      5     0.67    *      *            lock		xaddq	%rax, (%rbx)
 # CHECK-NEXT:  2      1     0.50                        xchgb	%bl, %cl
-# CHECK-NEXT:  5      7     0.50    *      *            xchgb	%bl, (%rbx)
-# CHECK-NEXT:  5      7     0.50    *      *            lock		xchgb	%bl, (%rbx)
+# CHECK-NEXT:  2      7     0.50    *      *            xchgb	%bl, (%rbx)
+# CHECK-NEXT:  2      7     0.50    *      *            lock		xchgb	%bl, (%rbx)
 # CHECK-NEXT:  2      1     0.50                        xchgw	%bx, %ax
 # CHECK-NEXT:  2      1     0.50                        xchgw	%bx, %cx
-# CHECK-NEXT:  5      7     0.50    *      *            xchgw	%ax, (%rbx)
-# CHECK-NEXT:  5      7     0.50    *      *            lock		xchgw	%ax, (%rbx)
+# CHECK-NEXT:  2      7     0.50    *      *            xchgw	%ax, (%rbx)
+# CHECK-NEXT:  2      7     0.50    *      *            lock		xchgw	%ax, (%rbx)
 # CHECK-NEXT:  2      0     0.33                        xchgl	%ebx, %eax
 # CHECK-NEXT:  2      0     0.33                        xchgl	%ebx, %ecx
 # CHECK-NEXT:  2      6     0.50    *      *            xchgl	%eax, (%rbx)
@@ -1975,7 +1975,7 @@ xorq (%rax), %rdi
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 259.00 259.00 259.00 1733.00 1865.50 1775.50 1529.50 1.50  -    -      -      -      -      -      -     259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00
+# CHECK-NEXT: 259.00 259.00 259.00 1725.00 1865.50 1775.50 1529.50 1.50  -    -      -      -      -      -      -     259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2266,23 +2266,23 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  -      -      -     0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     decq	%rdi
 # CHECK-NEXT: 0.67   0.67   0.67   0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -     0.67   0.67   0.67   0.33   0.33   0.33   0.50   0.50   decq	(%rax)
 # CHECK-NEXT: 0.67   0.67   0.67   0.25   0.25   0.25   0.25    -      -      -      -      -      -      -      -     0.67   0.67   0.67   0.33   0.33   0.33   0.50   0.50   lock		decq	(%rax)
-# CHECK-NEXT:  -      -      -     10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
-# CHECK-NEXT: 0.33   0.33   0.33   10.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divb	(%rax)
-# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
-# CHECK-NEXT: 0.33   0.33   0.33   11.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divw	(%rax)
-# CHECK-NEXT:  -      -      -     13.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
-# CHECK-NEXT: 0.33   0.33   0.33   13.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divl	(%rax)
-# CHECK-NEXT:  -      -      -     17.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
-# CHECK-NEXT: 0.33   0.33   0.33   17.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divq	(%rax)
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
+# CHECK-NEXT: 0.33   0.33   0.33   9.00    -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divb	(%rax)
+# CHECK-NEXT:  -      -      -     10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
+# CHECK-NEXT: 0.33   0.33   0.33   10.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divw	(%rax)
+# CHECK-NEXT:  -      -      -     12.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
+# CHECK-NEXT: 0.33   0.33   0.33   12.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divl	(%rax)
+# CHECK-NEXT:  -      -      -     18.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
+# CHECK-NEXT: 0.33   0.33   0.33   18.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     divq	(%rax)
 # CHECK-NEXT:  -      -      -     25.00  25.00  25.00  25.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     enter	$7, $4095
-# CHECK-NEXT:  -      -      -     10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
-# CHECK-NEXT: 0.33   0.33   0.33   10.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivb	(%rax)
-# CHECK-NEXT:  -      -      -     11.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
-# CHECK-NEXT: 0.33   0.33   0.33   11.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivw	(%rax)
-# CHECK-NEXT:  -      -      -     13.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
-# CHECK-NEXT: 0.33   0.33   0.33   13.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivl	(%rax)
-# CHECK-NEXT:  -      -      -     17.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
-# CHECK-NEXT: 0.33   0.33   0.33   17.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivq	(%rax)
+# CHECK-NEXT:  -      -      -     9.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
+# CHECK-NEXT: 0.33   0.33   0.33   9.00    -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivb	(%rax)
+# CHECK-NEXT:  -      -      -     10.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
+# CHECK-NEXT: 0.33   0.33   0.33   10.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivw	(%rax)
+# CHECK-NEXT:  -      -      -     12.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
+# CHECK-NEXT: 0.33   0.33   0.33   12.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivl	(%rax)
+# CHECK-NEXT:  -      -      -     18.00   -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
+# CHECK-NEXT: 0.33   0.33   0.33   18.00   -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     idivq	(%rax)
 # CHECK-NEXT:  -      -      -      -     3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     imulb	%dil
 # CHECK-NEXT: 0.33   0.33   0.33    -     3.00    -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     imulb	(%rax)
 # CHECK-NEXT:  -      -      -      -     3.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     imulw	%di
diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp
index 47469983..966b1f0 100644
--- a/llvm/unittests/ADT/STLExtrasTest.cpp
+++ b/llvm/unittests/ADT/STLExtrasTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -27,6 +28,7 @@
 using namespace llvm;
 
 using testing::ElementsAre;
+using testing::ElementsAreArray;
 using testing::UnorderedElementsAre;
 
 namespace {
@@ -772,48 +774,30 @@ TEST(STLExtrasTest, DropBeginTest) {
   SmallVector<int, 5> vec{0, 1, 2, 3, 4};
 
   for (int n = 0; n < 5; ++n) {
-    int i = n;
-    for (auto &v : drop_begin(vec, n)) {
-      EXPECT_EQ(v, i);
-      i += 1;
-    }
-    EXPECT_EQ(i, 5);
+    EXPECT_THAT(drop_begin(vec, n),
+                ElementsAreArray(ArrayRef(&vec[n], vec.size() - n)));
   }
 }
 
 TEST(STLExtrasTest, DropBeginDefaultTest) {
   SmallVector<int, 5> vec{0, 1, 2, 3, 4};
 
-  int i = 1;
-  for (auto &v : drop_begin(vec)) {
-    EXPECT_EQ(v, i);
-    i += 1;
-  }
-  EXPECT_EQ(i, 5);
+  EXPECT_THAT(drop_begin(vec), ElementsAre(1, 2, 3, 4));
 }
 
 TEST(STLExtrasTest, DropEndTest) {
   SmallVector<int, 5> vec{0, 1, 2, 3, 4};
 
   for (int n = 0; n < 5; ++n) {
-    int i = 0;
-    for (auto &v : drop_end(vec, n)) {
-      EXPECT_EQ(v, i);
-      i += 1;
-    }
-    EXPECT_EQ(i, 5 - n);
+    EXPECT_THAT(drop_end(vec, n),
+                ElementsAreArray(ArrayRef(vec.data(), vec.size() - n)));
   }
 }
 
 TEST(STLExtrasTest, DropEndDefaultTest) {
   SmallVector<int, 5> vec{0, 1, 2, 3, 4};
 
-  int i = 0;
-  for (auto &v : drop_end(vec)) {
-    EXPECT_EQ(v, i);
-    i += 1;
-  }
-  EXPECT_EQ(i, 4);
+  EXPECT_THAT(drop_end(vec), ElementsAre(0, 1, 2, 3));
 }
 
 TEST(STLExtrasTest, MapRangeTest) {
diff --git a/llvm/unittests/AsmParser/AsmParserTest.cpp b/llvm/unittests/AsmParser/AsmParserTest.cpp
index ce22670..898a829 100644
--- a/llvm/unittests/AsmParser/AsmParserTest.cpp
+++ b/llvm/unittests/AsmParser/AsmParserTest.cpp
@@ -6,7 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/AsmParser/SlotMapping.h"
 #include "llvm/IR/Constants.h"
@@ -14,10 +16,14 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
+#define DEBUG_TYPE "unittest-asm-parser-tests"
+
 using namespace llvm;
 
 namespace {
@@ -479,4 +485,53 @@ TEST(AsmParserTest, DIExpressionBodyAtBeginningWithSlotMappingParsing) {
   ASSERT_EQ(Mapping.MetadataNodes.size(), 0u);
 }
 
+#define ASSERT_EQ_LOC(Loc1, Loc2)                                              \
+  do {                                                                         \
+    EXPECT_TRUE(Loc1.contains(Loc2) && Loc2.contains(Loc1))                    \
+        << #Loc1 " location: " << Loc1.Start.Line << ":" << Loc1.Start.Col     \
+        << " - " << Loc1.End.Line << ":" << Loc1.End.Col << "\n"               \
+        << #Loc2 " location: " << Loc2.Start.Line << ":" << Loc2.Start.Col     \
+        << " - " << Loc2.End.Line << ":" << Loc2.End.Col << "\n";              \
+  } while (false)
+
+TEST(AsmParserTest, ParserObjectLocations) {
+  StringRef Source = "define i32 @main() {\n"
+                     "entry:\n"
+                     "    %a = add i32 1, 2\n"
+                     "    ret i32 %a\n"
+                     "}\n";
+  LLVMContext Ctx;
+  SMDiagnostic Error;
+  SlotMapping Mapping;
+  AsmParserContext ParserContext;
+  auto Mod = parseAssemblyString(Source, Error, Ctx, &Mapping, &ParserContext);
+  ASSERT_TRUE(Mod != nullptr); // Parsing must succeed; Mod is used below.
+  auto *MainFn = Mod->getFunction("main");
+  ASSERT_TRUE(MainFn != nullptr);
+  // Whole function: from `define` to just past the closing `}`.
+  auto MaybeMainLoc = ParserContext.getFunctionLocation(MainFn);
+  ASSERT_TRUE(MaybeMainLoc.has_value());
+  auto MainLoc = MaybeMainLoc.value();
+  auto ExpectedMainLoc = FileLocRange(FileLoc{0, 0}, FileLoc{4, 1});
+  ASSERT_EQ_LOC(MainLoc, ExpectedMainLoc);
+  // Entry block: from the `entry:` label to the end of the `ret` line.
+  auto &EntryBB = MainFn->getEntryBlock();
+  auto MaybeEntryBBLoc = ParserContext.getBlockLocation(&EntryBB);
+  ASSERT_TRUE(MaybeEntryBBLoc.has_value());
+  auto EntryBBLoc = MaybeEntryBBLoc.value();
+  auto ExpectedEntryBBLoc = FileLocRange(FileLoc{1, 0}, FileLoc{3, 14});
+  ASSERT_EQ_LOC(EntryBBLoc, ExpectedEntryBBLoc);
+  // Expected ranges for the `add` and `ret` instructions, in block order.
+  SmallVector<FileLocRange> InstructionLocations = {
+      FileLocRange(FileLoc{2, 4}, FileLoc{2, 21}),
+      FileLocRange(FileLoc{3, 4}, FileLoc{3, 14})};
+  // zip() pairs each instruction with its expected range.
+  for (const auto &[Inst, ExpectedLoc] : zip(EntryBB, InstructionLocations)) {
+    auto MaybeInstLoc = ParserContext.getInstructionLocation(&Inst);
+    ASSERT_TRUE(MaybeInstLoc.has_value());
+    auto InstLoc = MaybeInstLoc.value();
+    ASSERT_EQ_LOC(InstLoc, ExpectedLoc);
+  }
+}
+
 } // end anonymous namespace
diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
index 6c08173..af2d56d 100644
--- a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
+++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
@@ -383,14 +383,14 @@ class AsmPrinterHandlerTest : public AsmPrinterFixtureBase {
 
   public:
     TestHandler(AsmPrinterHandlerTest &Test) : Test(Test) {}
-    virtual ~TestHandler() {}
-    virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
-    virtual void beginModule(Module *M) override { Test.BeginCount++; }
-    virtual void endModule() override { Test.EndCount++; }
-    virtual void beginFunction(const MachineFunction *MF) override {}
-    virtual void endFunction(const MachineFunction *MF) override {}
-    virtual void beginInstruction(const MachineInstr *MI) override {}
-    virtual void endInstruction() override {}
+    ~TestHandler() override {}
+    void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+    void beginModule(Module *M) override { Test.BeginCount++; }
+    void endModule() override { Test.EndCount++; }
+    void beginFunction(const MachineFunction *MF) override {}
+    void endFunction(const MachineFunction *MF) override {}
+    void beginInstruction(const MachineInstr *MI) override {}
+    void endInstruction() override {}
   };
 
 protected:
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index ce2a38b..ff87e7b 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -69,7 +69,7 @@ public:
 
   InstrRefLDVTest() : Ctx(), Mod(std::make_unique<Module>("beehives", Ctx)) {}
 
-  void SetUp() {
+  void SetUp() override {
     // Boilerplate that creates a MachineFunction and associated blocks.
 
     Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
diff --git a/llvm/unittests/CodeGen/MIR2VecTest.cpp b/llvm/unittests/CodeGen/MIR2VecTest.cpp
index 8710d6b..d42749c 100644
--- a/llvm/unittests/CodeGen/MIR2VecTest.cpp
+++ b/llvm/unittests/CodeGen/MIR2VecTest.cpp
@@ -54,6 +54,9 @@ protected:
   std::unique_ptr<Module> M;
   std::unique_ptr<TargetMachine> TM;
   const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  MachineFunction *MF = nullptr;
 
   static void SetUpTestCase() {
     InitializeAllTargets();
@@ -90,15 +93,24 @@ protected:
     Function *F =
         Function::Create(FT, Function::ExternalLinkage, "test", M.get());
 
-    // Get the target instruction info
+    // Create MMI and MF to get TRI and MRI
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+    MF = &MMI->getOrCreateMachineFunction(*F);
+
+    // Get the target instruction info and register info
     TII = TM->getSubtargetImpl(*F)->getInstrInfo();
-    if (!TII) {
-      GTEST_SKIP() << "Failed to get target instruction info; Skipping test";
+    TRI = TM->getSubtargetImpl(*F)->getRegisterInfo();
+    if (!TII || !TRI) {
+      GTEST_SKIP()
+          << "Failed to get target instruction/register info; Skipping test";
       return;
     }
   }
 
-  void TearDown() override { TII = nullptr; }
+  void TearDown() override {
+    TII = nullptr;
+    TRI = nullptr;
+  }
 
   // Find an opcode by name
   int findOpcodeByName(StringRef Name) {
@@ -110,17 +122,94 @@ protected:
   }
 
   // Create a vocabulary with specific opcodes and embeddings
-  Expected<MIRVocabulary>
-  createTestVocab(std::initializer_list<std::pair<const char *, float>> opcodes,
-                  unsigned dimension = 2) {
-    assert(TII && "TargetInstrInfo not initialized");
-    VocabMap VMap;
-    for (const auto &[name, value] : opcodes)
-      VMap[name] = Embedding(dimension, value);
-    return MIRVocabulary::create(std::move(VMap), *TII);
+  // This might cause errors in future when the validation in
+  // MIRVocabulary::generateStorage() enforces hard checks on the vocabulary
+  // entries.
+  Expected<MIRVocabulary> createTestVocab(
+      std::initializer_list<std::pair<const char *, float>> Opcodes,
+      std::initializer_list<std::pair<const char *, float>> CommonOperands,
+      std::initializer_list<std::pair<const char *, float>> PhyRegs,
+      std::initializer_list<std::pair<const char *, float>> VirtRegs,
+      unsigned Dimension = 2) {
+    assert(TII && TRI && MF && "Target info not initialized");
+    VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+    for (const auto &[Name, Value] : Opcodes)
+      OpcodeMap[Name] = Embedding(Dimension, Value);
+
+    for (const auto &[Name, Value] : CommonOperands)
+      CommonOperandMap[Name] = Embedding(Dimension, Value);
+
+    for (const auto &[Name, Value] : PhyRegs)
+      PhyRegMap[Name] = Embedding(Dimension, Value);
+
+    for (const auto &[Name, Value] : VirtRegs)
+      VirtRegMap[Name] = Embedding(Dimension, Value);
+
+    // If any section is empty, create minimal maps for other vocabulary
+    // sections to satisfy validation
+    if (Opcodes.size() == 0)
+      OpcodeMap["NOOP"] = Embedding(Dimension, 0.0f);
+    if (CommonOperands.size() == 0)
+      CommonOperandMap["Immediate"] = Embedding(Dimension, 0.0f);
+    if (PhyRegs.size() == 0)
+      PhyRegMap["GR32"] = Embedding(Dimension, 0.0f);
+    if (VirtRegs.size() == 0)
+      VirtRegMap["GR32"] = Embedding(Dimension, 0.0f);
+
+    return MIRVocabulary::create(
+        std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+        std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
   }
 };
 
+// Parameterized test for empty vocab sections
+class MIR2VecVocabEmptySectionTestFixture
+    : public MIR2VecVocabTestFixture,
+      public ::testing::WithParamInterface<int> {
+protected:
+  void SetUp() override {
+    MIR2VecVocabTestFixture::SetUp();
+    // If base class setup was skipped (TII not initialized), skip derived setup
+    if (!TII)
+      GTEST_SKIP() << "Failed to get target instruction info in "
+                      "the base class setup; Skipping test";
+  }
+};
+
+TEST_P(MIR2VecVocabEmptySectionTestFixture, EmptySectionFailsValidation) {
+  int EmptySection = GetParam();
+  VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+
+  if (EmptySection != 0)
+    OpcodeMap["ADD"] = Embedding(2, 1.0f);
+  if (EmptySection != 1)
+    CommonOperandMap["Immediate"] = Embedding(2, 0.0f);
+  if (EmptySection != 2)
+    PhyRegMap["GR32"] = Embedding(2, 0.0f);
+  if (EmptySection != 3)
+    VirtRegMap["GR32"] = Embedding(2, 0.0f);
+
+  ASSERT_TRUE(TII != nullptr);
+  ASSERT_TRUE(TRI != nullptr);
+  ASSERT_TRUE(MF != nullptr);
+
+  auto VocabOrErr = MIRVocabulary::create(
+      std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+      std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
+  EXPECT_FALSE(static_cast<bool>(VocabOrErr))
+      << "Factory method should fail when section " << EmptySection
+      << " is empty";
+
+  if (!VocabOrErr) {
+    auto Err = VocabOrErr.takeError();
+    std::string ErrorMsg = toString(std::move(Err));
+    EXPECT_FALSE(ErrorMsg.empty());
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(EmptySection, MIR2VecVocabEmptySectionTestFixture,
+                         ::testing::Values(0, 1, 2, 3));
+
 TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
   // Test that same base opcodes get same canonical indices
   std::string BaseName1 = MIRVocabulary::extractBaseOpcodeName("ADD16ri");
@@ -133,7 +222,7 @@ TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
   // Create a MIRVocabulary instance to test the mapping
   // Use a minimal MIRVocabulary to trigger canonical mapping construction
   Embedding Val = Embedding(64, 1.0f);
-  auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64);
+  auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64);
   ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
       << "Failed to create vocabulary: "
       << toString(TestVocabOrErr.takeError());
@@ -190,7 +279,7 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
 
   // Create a MIRVocabulary instance to test deterministic mapping
   // Use a minimal MIRVocabulary to trigger canonical mapping construction
-  auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64);
+  auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64);
   ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
       << "Failed to create vocabulary: "
       << toString(TestVocabOrErr.takeError());
@@ -210,7 +299,8 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
 
 // Test MIRVocabulary construction
 TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
-  auto VocabOrErr = createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, 128);
+  auto VocabOrErr =
+      createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 128);
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &Vocab = *VocabOrErr;
@@ -231,42 +321,15 @@ TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
   EXPECT_GT(Count, 0u);
 }
 
-// Test factory method with empty vocabulary
-TEST_F(MIR2VecVocabTestFixture, EmptyVocabularyCreation) {
-  VocabMap EmptyVMap;
-
-  auto VocabOrErr = MIRVocabulary::create(std::move(EmptyVMap), *TII);
-  EXPECT_FALSE(static_cast<bool>(VocabOrErr))
-      << "Factory method should fail with empty vocabulary";
-
-  // Consume the error
-  if (!VocabOrErr) {
-    auto Err = VocabOrErr.takeError();
-    std::string ErrorMsg = toString(std::move(Err));
-    EXPECT_FALSE(ErrorMsg.empty());
-  }
-}
-
 // Fixture for embedding related tests
 class MIR2VecEmbeddingTestFixture : public MIR2VecVocabTestFixture {
 protected:
-  std::unique_ptr<MachineModuleInfo> MMI;
-  MachineFunction *MF = nullptr;
-
   void SetUp() override {
     MIR2VecVocabTestFixture::SetUp();
     // If base class setup was skipped (TII not initialized), skip derived setup
     if (!TII)
       GTEST_SKIP() << "Failed to get target instruction info in "
                       "the base class setup; Skipping test";
-
-    // Create a dummy function for MachineFunction
-    FunctionType *FT = FunctionType::get(Type::getVoidTy(*Ctx), false);
-    Function *F =
-        Function::Create(FT, Function::ExternalLinkage, "test", M.get());
-
-    MMI = std::make_unique<MachineModuleInfo>(TM.get());
-    MF = &MMI->getOrCreateMachineFunction(*F);
   }
 
   void TearDown() override { MIR2VecVocabTestFixture::TearDown(); }
@@ -298,7 +361,8 @@ protected:
 
 // Test factory method for creating embedder
 TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) {
-  auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1);
+  auto VocabOrErr =
+      MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1);
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &V = *VocabOrErr;
@@ -307,7 +371,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) {
 }
 
 TEST_F(MIR2VecEmbeddingTestFixture, CreateInvalidMode) {
-  auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1);
+  auto VocabOrErr =
+      MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1);
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &V = *VocabOrErr;
@@ -324,7 +389,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) {
           {"RET", 2.0f},  // [2.0, 2.0, 2.0, 2.0]
           {"TRAP", 3.0f}  // [3.0, 3.0, 3.0, 3.0]
       },
-      4);
+      {}, {}, {}, 4);
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &Vocab = *VocabOrErr;
@@ -378,7 +443,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) {
 // Test embedder with multiple basic blocks
 TEST_F(MIR2VecEmbeddingTestFixture, MultipleBasicBlocks) {
   // Create a test vocabulary
-  auto VocabOrErr = createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}});
+  auto VocabOrErr =
+      createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}}, {}, {}, {});
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &Vocab = *VocabOrErr;
@@ -431,7 +497,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) {
   MF->push_back(MBB);
 
   // Create embedder
-  auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 2);
+  auto VocabOrErr =
+      MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 2);
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &V = *VocabOrErr;
@@ -452,7 +519,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) {
 TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) {
   // Create a test vocabulary with limited entries
   // SUB is intentionally not included
-  auto VocabOrErr = createTestVocab({{"ADD", 1.0f}});
+  auto VocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {});
   ASSERT_TRUE(static_cast<bool>(VocabOrErr))
       << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
   auto &Vocab = *VocabOrErr;
@@ -494,4 +561,210 @@ TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) {
   Embedding ExpectedBBVector(2, 1.0f * ExpectedWeight);
   EXPECT_TRUE(MBBVector.approximatelyEquals(ExpectedBBVector));
 }
+
+// Test vocabulary string key generation
+TEST_F(MIR2VecEmbeddingTestFixture, VocabularyStringKeys) {
+  auto VocabOrErr =
+      createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 2);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
+
+  // Test that we can get string keys for all positions
+  for (size_t Pos = 0; Pos < Vocab.getCanonicalSize(); ++Pos) {
+    std::string Key = Vocab.getStringKey(Pos);
+    EXPECT_FALSE(Key.empty()) << "Empty key at position " << Pos;
+  }
+
+  // Test specific known positions if we can identify them
+  unsigned AddIndex = Vocab.getCanonicalIndexForBaseName("ADD");
+  std::string AddKey = Vocab.getStringKey(AddIndex);
+  EXPECT_EQ(AddKey, "ADD");
+
+  unsigned SubIndex = Vocab.getCanonicalIndexForBaseName("SUB");
+  std::string SubKey = Vocab.getStringKey(SubIndex);
+  EXPECT_EQ(SubKey, "SUB");
+
+  unsigned ImmIndex = Vocab.getCanonicalIndexForOperandName("Immediate");
+  std::string ImmKey = Vocab.getStringKey(ImmIndex);
+  EXPECT_EQ(ImmKey, "Immediate");
+
+  unsigned PhyRegIndex = Vocab.getCanonicalIndexForRegisterClass("GR32", true);
+  std::string PhyRegKey = Vocab.getStringKey(PhyRegIndex);
+  EXPECT_EQ(PhyRegKey, "PhyReg_GR32");
+
+  unsigned VirtRegIndex =
+      Vocab.getCanonicalIndexForRegisterClass("GR32", false);
+  std::string VirtRegKey = Vocab.getStringKey(VirtRegIndex);
+  EXPECT_EQ(VirtRegKey, "VirtReg_GR32");
+}
+
+// Test vocabulary dimension consistency
+TEST_F(MIR2VecEmbeddingTestFixture, DimensionConsistency) {
+  auto VocabOrErr = createTestVocab({{"TEST", 1.0f}}, {}, {}, {}, 5);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
+
+  EXPECT_EQ(Vocab.getDimension(), 5u);
+
+  // All embeddings should have the same dimension
+  for (auto IT = Vocab.begin(); IT != Vocab.end(); ++IT)
+    EXPECT_EQ((*IT).size(), 5u);
+}
+
+// Test invalid register handling through machine instruction creation
+TEST_F(MIR2VecEmbeddingTestFixture, InvalidRegisterHandling) {
+  float MOVValue = 1.5f;
+  float ImmValue = 0.5f;
+  float PhyRegValue = 0.2f;
+  auto VocabOrErr = createTestVocab(
+      {{"MOV", MOVValue}}, {{"Immediate", ImmValue}},
+      {{"GR8_ABCD_H", PhyRegValue}, {"GR8_ABCD_L", PhyRegValue + 0.1f}}, {}, 3);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
+
+  MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+  MF->push_back(MBB);
+
+  // Create a MOV instruction with actual operands including potential $noreg
+  // This tests the actual scenario where invalid registers are encountered
+  auto MovOpcode = findOpcodeByName("MOV32mr");
+  ASSERT_NE(MovOpcode, -1) << "MOV32mr opcode not found";
+  const MCInstrDesc &Desc = TII->get(MovOpcode);
+
+  // Use available physical registers from the target
+  unsigned BaseReg =
+      TRI->getNumRegs() > 1 ? 1 : 0; // First available physical register
+  unsigned ValueReg = TRI->getNumRegs() > 2 ? 2 : BaseReg;
+
+  // MOV32mr typically has: base, scale, index, displacement, segment, value
+  // Use the MachineInstrBuilder API properly
+  auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc)
+                     .addReg(BaseReg)   // base
+                     .addImm(1)         // scale
+                     .addReg(0)         // index ($noreg)
+                     .addImm(-4)        // displacement
+                     .addReg(0)         // segment ($noreg)
+                     .addReg(ValueReg); // value
+
+  auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+  ASSERT_TRUE(Embedder != nullptr);
+
+  // This should not crash even if the instruction has $noreg operands
+  auto InstEmb = Embedder->getMInstVector(*MovInst);
+  EXPECT_EQ(InstEmb.size(), 3u);
+
+  // Test the expected embedding value
+  Embedding ExpectedOpcodeContribution(3, MOVValue * mir2vec::OpcWeight);
+  auto ExpectedOperandContribution =
+      Embedding(3, PhyRegValue * mir2vec::RegOperandWeight)   // Base
+      + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // Scale
+      + Embedding(3, 0.0f)                                    // noreg
+      + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // displacement
+      + Embedding(3, 0.0f)                                    // noreg
+      + Embedding(3, (PhyRegValue + 0.1f) * mir2vec::RegOperandWeight); // Value
+  auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution;
+  EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+      << "MOV instruction embedding should match expected embedding";
+}
+
+// Test handling of both physical and virtual registers in an instruction
+TEST_F(MIR2VecEmbeddingTestFixture, PhysicalAndVirtualRegisterHandling) {
+  float MOVValue = 2.0f;
+  float ImmValue = 0.7f;
+  float PhyRegValue = 0.3f;
+  float VirtRegValue = 0.9f;
+
+  // Find GR32 register class
+  const TargetRegisterClass *GR32RC = nullptr;
+  for (unsigned i = 0; i < TRI->getNumRegClasses(); ++i) {
+    const TargetRegisterClass *RC = TRI->getRegClass(i);
+    if (std::string(TRI->getRegClassName(RC)) == "GR32") {
+      GR32RC = RC;
+      break;
+    }
+  }
+  ASSERT_TRUE(GR32RC != nullptr && GR32RC->isAllocatable())
+      << "No allocatable GR32 register class found";
+
+  // Get first available physical register from GR32
+  unsigned PhyReg = *GR32RC->begin();
+  // Create a virtual register of class GR32
+  unsigned VirtReg = MF->getRegInfo().createVirtualRegister(GR32RC);
+
+  // Create vocabulary with register class based keys
+  auto VocabOrErr =
+      createTestVocab({{"MOV", MOVValue}}, {{"Immediate", ImmValue}},
+                      {{"GR32_AD", PhyRegValue}}, // GR32_AD is the minimal key
+                      {{"GR32", VirtRegValue}}, 4);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
+
+  MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+  MF->push_back(MBB);
+
+  // Create a MOV32rr instruction: MOV32rr dst, src
+  auto MovOpcode = findOpcodeByName("MOV32rr");
+  ASSERT_NE(MovOpcode, -1) << "MOV32rr opcode not found";
+  const MCInstrDesc &Desc = TII->get(MovOpcode);
+
+  // MOV32rr: dst (physical), src (virtual)
+  auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc)
+                     .addReg(PhyReg)   // physical register destination
+                     .addReg(VirtReg); // virtual register source
+
+  // Create embedder with virtual register support
+  auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+  ASSERT_TRUE(Embedder != nullptr);
+
+  // This should not crash and should produce a valid embedding
+  auto InstEmb = Embedder->getMInstVector(*MovInst);
+  EXPECT_EQ(InstEmb.size(), 4u);
+
+  // Test the expected embedding value
+  Embedding ExpectedOpcodeContribution(4, MOVValue * mir2vec::OpcWeight);
+  auto ExpectedOperandContribution =
+      Embedding(4, PhyRegValue * mir2vec::RegOperandWeight) // dst (physical)
+      + Embedding(4, VirtRegValue * mir2vec::RegOperandWeight); // src (virtual)
+  auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution;
+  EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+      << "MOV32rr instruction embedding should match expected embedding";
+}
+
+// Test precise embedding calculation with known operands
+TEST_F(MIR2VecEmbeddingTestFixture, EmbeddingCalculation) {
+  auto VocabOrErr = createTestVocab({{"NOOP", 2.0f}}, {}, {}, {}, 2);
+  ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+      << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+  auto &Vocab = *VocabOrErr;
+
+  MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+  MF->push_back(MBB);
+
+  // Create a simple NOOP instruction (no operands)
+  auto NoopInst = createMachineInstr(*MBB, "NOOP");
+  ASSERT_TRUE(NoopInst != nullptr);
+
+  auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+  ASSERT_TRUE(Embedder != nullptr);
+
+  // Get the instruction embedding
+  auto InstEmb = Embedder->getMInstVector(*NoopInst);
+  EXPECT_EQ(InstEmb.size(), 2u);
+
+  // For NOOP with no operands, the embedding should be exactly the opcode
+  // embedding
+  float ExpectedWeight = mir2vec::OpcWeight;
+  Embedding ExpectedEmb(2, 2.0f * ExpectedWeight);
+
+  EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+      << "NOOP instruction embedding should match opcode embedding";
+
+  // Verify individual components
+  EXPECT_FLOAT_EQ(InstEmb[0], 2.0f * ExpectedWeight);
+  EXPECT_FLOAT_EQ(InstEmb[1], 2.0f * ExpectedWeight);
+}
 } // namespace
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 16b9979..aa56aaf 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -550,6 +550,31 @@ TEST_F(SelectionDAGPatternMatchTest, matchNode) {
   EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_ConstInt(), m_Value())));
 }
 
+TEST_F(SelectionDAGPatternMatchTest, matchSelectLike) {
+  SDLoc DL;
+  auto Int32VT = EVT::getIntegerVT(Context, 32);
+  auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+
+  SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, Int32VT);
+  SDValue TVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+  SDValue FVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT);
+
+  SDValue VCond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, VInt32VT);
+  SDValue VTVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, VInt32VT);
+  SDValue VFVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT);
+  // Each select's result type matches its true/false operands: the scalar
+  SDValue Select = DAG->getNode(ISD::SELECT, DL, Int32VT, Cond, TVal, FVal);
+  SDValue VSelect =
+      DAG->getNode(ISD::VSELECT, DL, VInt32VT, VCond, VTVal, VFVal);
+  // SELECT yields Int32VT, while VSELECT yields the vector type VInt32VT.
+  using namespace SDPatternMatch;
+  EXPECT_TRUE(sd_match(Select, m_SelectLike(m_Specific(Cond), m_Specific(TVal),
+                                            m_Specific(FVal))));
+  EXPECT_TRUE(
+      sd_match(VSelect, m_SelectLike(m_Specific(VCond), m_Specific(VTVal),
+                                     m_Specific(VFVal))));
+}
+
 namespace {
 struct VPMatchContext : public SDPatternMatch::BasicMatchContext {
   using SDPatternMatch::BasicMatchContext::BasicMatchContext;
diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
index 3c9374b..4235c93 100644
--- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
@@ -716,3 +716,32 @@ attributes #0 = { presplitcoroutine }
   EXPECT_FALSE(llvm::isPresplitCoroSuspendExitEdge(
       *ExitN.getSinglePredecessor(), ExitN));
 }
+
+TEST(BasicBlockUtils, BasicBlockPrintable) {
+  std::string S;
+  std::string SCheck;
+  llvm::raw_string_ostream OS{S};
+  llvm::raw_string_ostream OSCheck{SCheck};
+  // The IR has an unnamed entry block, a named block and a dot-prefixed one.
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C, R"IR(
+define void @foo() {
+  br label %bb0
+bb0:
+  br label %.exit
+.exit:
+  ret void
+}
+)IR");
+  // For every block, printBasicBlock must produce what printAsOperand does.
+  Function *F = M->getFunction("foo");
+  for (const BasicBlock &BB : *F) {
+    OS << printBasicBlock(&BB);
+    BB.printAsOperand(OSCheck);
+    EXPECT_EQ(OS.str(), OSCheck.str());
+    S.clear();
+    SCheck.clear();
+  }
+  OS << printBasicBlock(nullptr); // A null block prints a fixed placeholder.
+  EXPECT_EQ(OS.str(), "<nullptr>");
+}
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index d1b14fb..0b90f91 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -285,7 +285,7 @@ emitGetNamedOperandIdx(raw_ostream &OS,
 
 static void
 emitGetOperandIdxName(raw_ostream &OS,
-                      MapVector<StringRef, unsigned> OperandNameToID,
+                      const MapVector<StringRef, unsigned> &OperandNameToID,
                       const MapVector<SmallVector<int>, unsigned> &OperandMap,
                       unsigned MaxNumOperands, unsigned NumOperandNames) {
   OS << "LLVM_READONLY OpName getOperandIdxName(uint16_t Opcode, int16_t Idx) "
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 38ba466..df9ddf9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -45,6 +45,7 @@ static_library("Support") {
     "ARMAttributeParser.cpp",
     "ARMBuildAttributes.cpp",
     "ARMWinEH.cpp",
+    "AllocToken.cpp",
     "Allocator.cpp",
     "AutoConvert.cpp",
     "BalancedPartitioning.cpp",
diff --git a/llvm/utils/update_mc_test_checks.py b/llvm/utils/update_mc_test_checks.py
index ab7fe19..67fff56 100755
--- a/llvm/utils/update_mc_test_checks.py
+++ b/llvm/utils/update_mc_test_checks.py
@@ -290,11 +290,9 @@ def update_test(ti: common.TestInfo):
 
         # prefix is selected and generated with most shared output lines
         # each run_id can only be used once
-        gen_prefix = ""
         used_runid = set()
 
-        # line number diff between generated prefix and testline
-        line_offset = 1
+        selected_prefixes = set()
         for prefix, tup in p_dict_sorted.items():
             o, run_ids = tup
 
@@ -308,18 +306,24 @@ def update_test(ti: common.TestInfo):
                 else:
                     used_runid.add(i)
             if not skip:
-                used_prefixes.add(prefix)
+                selected_prefixes.add(prefix)
 
-                if hasErr(o):
-                    newline = getErrCheckLine(prefix, o, mc_mode, line_offset)
-                else:
-                    newline = getStdCheckLine(prefix, o, mc_mode)
+        # Generate check lines in alphabetical order.
+        check_lines = []
+        for prefix in sorted(selected_prefixes):
+            o, run_ids = p_dict[prefix]
+            used_prefixes.add(prefix)
+
+            if hasErr(o):
+                line_offset = len(check_lines) + 1
+                check = getErrCheckLine(prefix, o, mc_mode, line_offset)
+            else:
+                check = getStdCheckLine(prefix, o, mc_mode)
 
-                if newline:
-                    gen_prefix += newline
-                    line_offset += 1
+            if check:
+                check_lines.append(check.strip())
 
-        generated_prefixes[input_line] = gen_prefix.rstrip("\n")
+        generated_prefixes[input_line] = "\n".join(check_lines)
 
     # write output
     for input_info in ti.iterlines(output_lines):
diff --git a/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h
new file mode 100644
index 0000000..0833462
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h
@@ -0,0 +1,135 @@
+//===- OpenACCSupport.h - OpenACC Support Interface -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the OpenACCSupport analysis interface, which provides
+// extensible support for OpenACC passes. Custom implementations
+// can be registered to provide pipeline and dialect-specific information
+// that cannot be adequately expressed through type or operation interfaces
+// alone.
+//
+// Usage Pattern:
+// ==============
+//
+// A pass that needs this functionality should call
+// getAnalysis<OpenACCSupport>(), which will provide either:
+// - A cached version if previously initialized, OR
+// - A default implementation if not previously initialized
+//
+// This analysis is never invalidated (isInvalidated returns false), so it only
+// needs to be initialized once and will persist throughout the pass pipeline.
+//
+// Registering a Custom Implementation:
+// =====================================
+//
+// If a custom implementation is needed, create a pass that runs BEFORE the pass
+// that needs the analysis. In this setup pass, use
+// getAnalysis<OpenACCSupport>() followed by setImplementation() to register
+// your custom implementation. The custom implementation will need to provide
+// implementation for all methods defined in the `OpenACCSupportTraits::Concept`
+// class.
+//
+// Example:
+//   void MySetupPass::runOnOperation() {
+//     OpenACCSupport &support = getAnalysis<OpenACCSupport>();
+//     support.setImplementation(MyCustomImpl());
+//   }
+//
+//   void MyAnalysisConsumerPass::runOnOperation() {
+//     OpenACCSupport &support = getAnalysis<OpenACCSupport>();
+//     std::string name = support.getVariableName(someValue);
+//     // ... use the analysis results
+//   }
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H
+#define MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H
+
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/AnalysisManager.h"
+#include <memory>
+#include <string>
+
+namespace mlir {
+namespace acc {
+
+namespace detail {
+/// This class contains internal trait classes used by OpenACCSupport.
+/// It follows the Concept-Model pattern used throughout MLIR (e.g., in
+/// AliasAnalysis and interface definitions).
+struct OpenACCSupportTraits {
+  class Concept {
+  public:
+    virtual ~Concept() = default;
+
+    /// Get the variable name for a given MLIR value.
+    virtual std::string getVariableName(Value v) = 0;
+  };
+
+  /// This class wraps a concrete OpenACCSupport implementation and forwards
+  /// interface calls to it. This provides type erasure, allowing different
+  /// implementation types to be used interchangeably without inheritance.
+  template <typename ImplT>
+  class Model final : public Concept {
+  public:
+    explicit Model(ImplT &&impl) : impl(std::forward<ImplT>(impl)) {}
+    ~Model() override = default;
+
+    std::string getVariableName(Value v) final {
+      return impl.getVariableName(v);
+    }
+
+  private:
+    ImplT impl;
+  };
+};
+} // namespace detail
+
+//===----------------------------------------------------------------------===//
+// OpenACCSupport
+//===----------------------------------------------------------------------===//
+
+class OpenACCSupport {
+  using Concept = detail::OpenACCSupportTraits::Concept;
+  template <typename ImplT>
+  using Model = detail::OpenACCSupportTraits::Model<ImplT>;
+
+public:
+  OpenACCSupport() = default;
+  OpenACCSupport(Operation *op) {}
+
+  /// Register a custom OpenACCSupport implementation. Only one implementation
+  /// can be registered at a time; calling this replaces any existing
+  /// implementation.
+  template <typename AnalysisT>
+  void setImplementation(AnalysisT &&analysis) {
+    impl =
+        std::make_unique<Model<AnalysisT>>(std::forward<AnalysisT>(analysis));
+  }
+
+  /// Get the variable name for a given value.
+  ///
+  /// \param v The MLIR value to get the variable name for.
+  /// \return The variable name, or an empty string if unavailable.
+  std::string getVariableName(Value v);
+
+  /// Signal that this analysis should always be preserved so that
+  /// underlying implementation registration is not lost.
+  bool isInvalidated(const AnalysisManager::PreservedAnalyses &pa) {
+    return false;
+  }
+
+private:
+  /// The registered custom implementation (if any).
+  std::unique_ptr<Concept> impl;
+};
+
+} // namespace acc
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENACC_ANALYSIS_OPENACCSUPPORT_H
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
index 378f434..0ee88c6 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
@@ -38,6 +38,11 @@ std::optional<ClauseDefaultValue> getDefaultAttr(mlir::Operation *op);
 /// Get the type category of an OpenACC variable.
 mlir::acc::VariableTypeCategory getTypeCategory(mlir::Value var);
 
+/// Attempts to extract the variable name from a value by walking through
+/// view-like operations until an `acc.var_name` attribute is found. Returns
+/// empty string if no name is found.
+std::string getVariableName(mlir::Value v);
+
 } // namespace acc
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index b6c8dba..691163d 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(Transforms)
+
 set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend/OpenMP/OMP.td)
 mlir_tablegen(OmpCommon.td --gen-directive-decl --directives-dialect=OpenMP)
 add_mlir_dialect_tablegen_target(omp_common_td)
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
new file mode 100644
index 0000000..22f0d92
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name OpenMP)
+add_public_tablegen_target(MLIROpenMPPassIncGen)
+
+add_mlir_doc(Passes OpenMPPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
new file mode 100644
index 0000000..21b6d1f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
@@ -0,0 +1,26 @@
+//===- Passes.h - OpenMP Pass Construction and Registration -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+
+namespace omp {
+
+/// Generate the declarations and registration hooks for the OpenMP passes.
+#define GEN_PASS_DECL
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+
+} // namespace omp
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
new file mode 100644
index 0000000..1fde7e0
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
@@ -0,0 +1,26 @@
+//===-- Passes.td - OpenMP pass definition file ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prepare", "ModuleOp"> {
+    let summary = "Prepare OpenMP maps for privatization for deferred target tasks";
+    let description = [{
+      When generating LLVM IR for privatized variables in an OpenMP offloading directive (e.g. omp::TargetOp)
+      that creates a deferred target task (when the nowait clause is used), we need to copy the privatized
+      variable out of the stack of the generating task and into the heap so that the deferred target task
+      can still access it. However, if such a privatized variable is also mapped, typically the case for
+      allocatables, then the corresponding `omp::MapInfoOp` needs to be fixed up to map the new heap-allocated
+      variable and not the original variable.
+    }];
+  let dependentDialects = ["LLVM::LLVMDialect"];
+}
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 85f0fd1d..9b15435 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1927,16 +1927,16 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
       else
         llvm_unreachable("unsupported row length");
 
-      const Value vdst0 = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
-      const Value vdst1 = LLVM::ExtractValueOp::create(rewriter, loc, res, {1});
+      Value vdst0 = LLVM::ExtractValueOp::create(rewriter, loc, res, {0});
+      Value vdst1 = LLVM::ExtractValueOp::create(rewriter, loc, res, {1});
 
-      const Value isEqual =
-          rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::eq, vdst0, v);
+      Value isEqual = LLVM::ICmpOp::create(rewriter, loc,
+                                           LLVM::ICmpPredicate::eq, vdst0, v);
 
       // Per `permlane(16|32)` semantics: if the first extracted element equals
       // 'v', the result is the second element; otherwise it is the first.
       Value vdstNew =
-          rewriter.create<LLVM::SelectOp>(loc, isEqual, vdst1, vdst0);
+          LLVM::SelectOp::create(rewriter, loc, isEqual, vdst1, vdst0);
       permuted.emplace_back(vdstNew);
     }
 
diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp
index 42099aa..12adfe1 100644
--- a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp
+++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp
@@ -93,11 +93,11 @@ struct PowiOpToROCDLLibraryCalls : public OpRewritePattern<complex::PowiOp> {
 
     Location loc = op.getLoc();
     Value exponentReal =
-        rewriter.create<arith::SIToFPOp>(loc, exponentFloatType, op.getRhs());
-    Value zeroImag = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getZeroAttr(exponentFloatType));
-    Value exponent = rewriter.create<complex::CreateOp>(
-        loc, op.getLhs().getType(), exponentReal, zeroImag);
+        arith::SIToFPOp::create(rewriter, loc, exponentFloatType, op.getRhs());
+    Value zeroImag = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getZeroAttr(exponentFloatType));
+    Value exponent = complex::CreateOp::create(
+        rewriter, loc, op.getLhs().getType(), exponentReal, zeroImag);
 
     rewriter.replaceOpWithNewOp<complex::PowOp>(op, op.getType(), op.getLhs(),
                                                 exponent, op.getFastmathAttr());
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 5613e02..0fe7239 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -937,14 +937,14 @@ struct PowiOpConversion : public OpConversionPattern<complex::PowiOp> {
     auto elementType = cast<FloatType>(type.getElementType());
 
     Value floatExponent =
-        builder.create<arith::SIToFPOp>(elementType, adaptor.getRhs());
+        arith::SIToFPOp::create(builder, elementType, adaptor.getRhs());
     Value zero = arith::ConstantOp::create(
         builder, elementType, builder.getFloatAttr(elementType, 0.0));
     Value complexExponent =
         complex::CreateOp::create(builder, type, floatExponent, zero);
 
-    auto pow = builder.create<complex::PowOp>(
-        type, adaptor.getLhs(), complexExponent, op.getFastmathAttr());
+    auto pow = complex::PowOp::create(builder, type, adaptor.getLhs(),
+                                      complexExponent, op.getFastmathAttr());
     rewriter.replaceOp(op, pow.getResult());
     return success();
   }
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 852c50c..d64c4d6 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -500,19 +500,19 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
           op->getParentWithTrait<mlir::OpTrait::AutomaticAllocationScope>();
       assert(scope && "Expected op to be inside automatic allocation scope");
       rewriter.setInsertionPointToStart(&scope->getRegion(0).front());
-      auto one = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1));
+      auto one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI32Type(),
+                                          rewriter.getI32IntegerAttr(1));
       sinPtr =
-          rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0);
+          LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0);
       cosPtr =
-          rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0);
+          LLVM::AllocaOp::create(rewriter, loc, ptrType, computeType, one, 0);
     }
 
     createSincosCall(rewriter, loc, sincosFunc, convertedInput, sinPtr, cosPtr,
                      op);
 
-    auto sinResult = rewriter.create<LLVM::LoadOp>(loc, computeType, sinPtr);
-    auto cosResult = rewriter.create<LLVM::LoadOp>(loc, computeType, cosPtr);
+    auto sinResult = LLVM::LoadOp::create(rewriter, loc, computeType, sinPtr);
+    auto cosResult = LLVM::LoadOp::create(rewriter, loc, computeType, cosPtr);
 
     rewriter.replaceOp(op, {maybeTrunc(sinResult, inputType, rewriter),
                             maybeTrunc(cosResult, inputType, rewriter)});
@@ -522,14 +522,15 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
 private:
   Value maybeExt(Value operand, PatternRewriter &rewriter) const {
     if (isa<Float16Type, BFloat16Type>(operand.getType()))
-      return rewriter.create<LLVM::FPExtOp>(
-          operand.getLoc(), Float32Type::get(rewriter.getContext()), operand);
+      return LLVM::FPExtOp::create(rewriter, operand.getLoc(),
+                                   Float32Type::get(rewriter.getContext()),
+                                   operand);
     return operand;
   }
 
   Value maybeTrunc(Value operand, Type type, PatternRewriter &rewriter) const {
     if (operand.getType() != type)
-      return rewriter.create<LLVM::FPTruncOp>(operand.getLoc(), type, operand);
+      return LLVM::FPTruncOp::create(rewriter, operand.getLoc(), type, operand);
     return operand;
   }
 
@@ -556,7 +557,7 @@ private:
     }
 
     SmallVector<Value> callOperands = {input, sinPtr, cosPtr};
-    rewriter.create<LLVM::CallOp>(loc, funcOp, callOperands);
+    LLVM::CallOp::create(rewriter, loc, funcOp, callOperands);
   }
 };
 
diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
index 229e40e..7cce324 100644
--- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
+++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
@@ -142,8 +142,8 @@ struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
     auto structType = LLVM::LLVMStructType::getLiteral(
         rewriter.getContext(), {llvmOperandType, llvmOperandType});
 
-    auto sincosOp = rewriter.create<LLVM::SincosOp>(
-        loc, structType, adaptor.getOperand(), attrs.getAttrs());
+    auto sincosOp = LLVM::SincosOp::create(
+        rewriter, loc, structType, adaptor.getOperand(), attrs.getAttrs());
 
     auto sinValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 0);
     auto cosValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 1);
diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
index 519d9c8..71e3f88 100644
--- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
+++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
@@ -394,9 +394,9 @@ private:
       if (!convertedType)
         return rewriter.notifyMatchFailure(whileOp, "type conversion failed");
 
-      emitc::VariableOp var = rewriter.create<emitc::VariableOp>(
-          loc, emitc::LValueType::get(convertedType), noInit);
-      rewriter.create<emitc::AssignOp>(loc, var.getResult(), init);
+      auto var = emitc::VariableOp::create(
+          rewriter, loc, emitc::LValueType::get(convertedType), noInit);
+      emitc::AssignOp::create(rewriter, loc, var.getResult(), init);
       loopVars.push_back(var);
     }
 
@@ -411,11 +411,11 @@ private:
     // Create a global boolean variable to store the loop condition state.
     Type i1Type = IntegerType::get(context, 1);
     auto globalCondition =
-        rewriter.create<emitc::VariableOp>(loc, emitc::LValueType::get(i1Type),
-                                           emitc::OpaqueAttr::get(context, ""));
+        emitc::VariableOp::create(rewriter, loc, emitc::LValueType::get(i1Type),
+                                  emitc::OpaqueAttr::get(context, ""));
     Value conditionVal = globalCondition.getResult();
 
-    auto loweredDo = rewriter.create<emitc::DoOp>(loc);
+    auto loweredDo = emitc::DoOp::create(rewriter, loc);
 
     // Convert region types to match the target dialect type system.
     if (failed(rewriter.convertRegionTypes(&whileOp.getBefore(),
@@ -450,12 +450,12 @@ private:
 
     // Convert scf.condition to condition variable assignment.
     Value condition = rewriter.getRemappedValue(condOp.getCondition());
-    rewriter.create<emitc::AssignOp>(loc, conditionVal, condition);
+    emitc::AssignOp::create(rewriter, loc, conditionVal, condition);
 
     // Wrap body region in conditional to preserve scf semantics. Only create
     // ifOp if after-region is non-empty.
     if (whileOp.getAfterBody()->getOperations().size() > 1) {
-      auto ifOp = rewriter.create<emitc::IfOp>(loc, condition, false, false);
+      auto ifOp = emitc::IfOp::create(rewriter, loc, condition, false, false);
 
       // Prepare the after region (loop body) for merging.
       Block *afterBlock = &whileOp.getAfter().front();
@@ -480,8 +480,8 @@ private:
     Block *condBlock = rewriter.createBlock(&condRegion);
     rewriter.setInsertionPointToStart(condBlock);
 
-    auto exprOp = rewriter.create<emitc::ExpressionOp>(
-        loc, i1Type, conditionVal, /*do_not_inline=*/false);
+    auto exprOp = emitc::ExpressionOp::create(
+        rewriter, loc, i1Type, conditionVal, /*do_not_inline=*/false);
     Block *exprBlock = rewriter.createBlock(&exprOp.getBodyRegion());
 
     // Set up the expression block to load the condition variable.
@@ -490,12 +490,12 @@ private:
 
     // Load the condition value and yield it as the expression result.
     Value cond =
-        rewriter.create<emitc::LoadOp>(loc, i1Type, exprBlock->getArgument(0));
-    rewriter.create<emitc::YieldOp>(loc, cond);
+        emitc::LoadOp::create(rewriter, loc, i1Type, exprBlock->getArgument(0));
+    emitc::YieldOp::create(rewriter, loc, cond);
 
     // Yield the expression as the condition region result.
     rewriter.setInsertionPointToEnd(condBlock);
-    rewriter.create<emitc::YieldOp>(loc, exprOp);
+    emitc::YieldOp::create(rewriter, loc, exprOp);
 
     return success();
   }
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 00df14b1..29afdc2 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -232,16 +232,16 @@ static Value createLinalgBodyCalculationForElementwiseOp(
         }
 
         intermediateType = rewriter.getIntegerType(intermediateBitWidth);
-        zpAddValue = rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getIntegerAttr(intermediateType, zpAdd));
+        zpAddValue = arith::ConstantOp::create(
+            rewriter, loc, rewriter.getIntegerAttr(intermediateType, zpAdd));
       } else {
         intermediateType = rewriter.getIntegerType(intermediateBitWidth);
         auto arg1 =
-            rewriter.create<arith::ExtSIOp>(loc, intermediateType, args[1]);
+            arith::ExtSIOp::create(rewriter, loc, intermediateType, args[1]);
         auto arg2 =
-            rewriter.create<arith::ExtSIOp>(loc, intermediateType, args[2]);
+            arith::ExtSIOp::create(rewriter, loc, intermediateType, args[2]);
         zpAddValue =
-            rewriter.create<arith::AddIOp>(loc, intermediateType, arg1, arg2);
+            arith::AddIOp::create(rewriter, loc, intermediateType, arg1, arg2);
       }
 
       // The negation can be applied by doing:
@@ -1402,8 +1402,8 @@ static Value collapse1xNTensorToN(PatternRewriter &rewriter, Value input,
   auto elemType = inputType.getElementType();
   auto collapsedType = RankedTensorType::get({}, elemType);
   // Emit the collapse op
-  return rewriter.create<tensor::CollapseShapeOp>(loc, collapsedType, input,
-                                                  reassociation);
+  return tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, input,
+                                         reassociation);
 }
 
 static llvm::SmallVector<int8_t>
@@ -1443,7 +1443,7 @@ static void setupLinalgGenericOpInputAndIndexingMap(
       IntegerAttr intAttr = isShift
                                 ? rewriter.getI8IntegerAttr(values.front())
                                 : rewriter.getI32IntegerAttr(values.front());
-      constant = rewriter.create<arith::ConstantOp>(loc, intAttr);
+      constant = arith::ConstantOp::create(rewriter, loc, intAttr);
     } else {
       auto elementType =
           isShift ? rewriter.getIntegerType(8) : rewriter.getI32Type();
@@ -1511,14 +1511,14 @@ static Value getExtendZp(OpBuilder &builder, Type valueTy,
                 .getResult(0);
       }
       if (zpTy.isUnsignedInteger()) {
-        return builder.create<arith::ExtUIOp>(loc, extendType, result);
+        return arith::ExtUIOp::create(builder, loc, extendType, result);
       } else {
-        return builder.create<arith::ExtSIOp>(loc, extendType, result);
+        return arith::ExtSIOp::create(builder, loc, extendType, result);
       }
     }
   } else {
-    return builder.create<arith::ConstantOp>(
-        loc, IntegerAttr::get(extendType, *maybeZp));
+    return arith::ConstantOp::create(builder, loc,
+                                     IntegerAttr::get(extendType, *maybeZp));
   }
   return result;
 }
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
index d4ff095..37a45d4 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
@@ -18,4 +18,5 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
   MLIRPass
   MLIRTransforms
   MLIRNVVMDialect
+  MLIROpenMPDialect
   )
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 9a8a63e..794dda9 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -437,13 +437,15 @@ transform::PromoteTensorOp::apply(transform::TransformRewriter &rewriter,
     for (auto [pos, dim] : llvm::enumerate(type.getShape())) {
       if (!ShapedType::isDynamic(dim))
         continue;
-      Value cst = rewriter.create<arith::ConstantIndexOp>(tensor.getLoc(), pos);
-      auto dimOp = rewriter.create<tensor::DimOp>(tensor.getLoc(), tensor, cst);
+      Value cst =
+          arith::ConstantIndexOp::create(rewriter, tensor.getLoc(), pos);
+      auto dimOp =
+          tensor::DimOp::create(rewriter, tensor.getLoc(), tensor, cst);
       preservedOps.insert(dimOp);
       dynamicDims.push_back(dimOp);
     }
-    auto allocation = rewriter.create<bufferization::AllocTensorOp>(
-        tensor.getLoc(), type, dynamicDims);
+    auto allocation = bufferization::AllocTensorOp::create(
+        rewriter, tensor.getLoc(), type, dynamicDims);
     // Set memory space if provided.
     if (getMemorySpaceAttr())
       allocation.setMemorySpaceAttr(getMemorySpaceAttr());
@@ -452,8 +454,8 @@ transform::PromoteTensorOp::apply(transform::TransformRewriter &rewriter,
     // Only insert a materialization (typically bufferizes to a copy) when the
     // value may be read from.
     if (needsMaterialization) {
-      auto copy = rewriter.create<bufferization::MaterializeInDestinationOp>(
-          tensor.getLoc(), tensor, allocated);
+      auto copy = bufferization::MaterializeInDestinationOp::create(
+          rewriter, tensor.getLoc(), tensor, allocated);
       preservedOps.insert(copy);
       promoted.push_back(copy.getResult());
     } else {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp
index 15eb51a..5e10ba3 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
 
@@ -43,6 +44,33 @@ struct StructuredOpInterface
     auto zero = arith::ConstantIndexOp::create(builder, loc, 0);
     auto one = arith::ConstantIndexOp::create(builder, loc, 1);
 
+    Value iterationDomainIsNonDegenerate;
+    for (auto [start, end] : llvm::zip(starts, ends)) {
+      auto startValue = getValueOrCreateConstantIndexOp(builder, loc, start);
+      auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end);
+
+      // The loop trip count is > 0 iff start < end.
+      Value dimensionHasNonZeroTripCount = index::CmpOp::create(
+          builder, loc, index::IndexCmpPredicate::SLT, startValue, endValue);
+
+      if (!iterationDomainIsNonDegenerate) {
+        iterationDomainIsNonDegenerate = dimensionHasNonZeroTripCount;
+      } else {
+        // Iteration domain is non-degenerate iff all dimensions have loop trip
+        // count > 0
+        iterationDomainIsNonDegenerate =
+            arith::AndIOp::create(builder, loc, iterationDomainIsNonDegenerate,
+                                  dimensionHasNonZeroTripCount);
+      }
+    }
+
+    if (!iterationDomainIsNonDegenerate)
+      return;
+
+    auto ifOp = scf::IfOp::create(builder, loc, iterationDomainIsNonDegenerate,
+                                  /*withElseRegion=*/false);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
     // Subtract one from the loop ends before composing with the indexing map
     transform(ends, ends.begin(), [&](OpFoldResult end) {
       auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end);
@@ -110,6 +138,7 @@ struct StructuredOpInterface
         builder.createOrFold<cf::AssertOp>(loc, cmpOp, msg);
       }
     }
+    builder.setInsertionPointAfter(ifOp);
   }
 };
 
diff --git a/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..f305068
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Analysis/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_mlir_dialect_library(MLIROpenACCAnalysis
+  OpenACCSupport.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIROpenACCDialect
+  MLIROpenACCUtils
+  MLIRSupport
+)
+
diff --git a/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp
new file mode 100644
index 0000000..f6b4534
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Analysis/OpenACCSupport.cpp
@@ -0,0 +1,26 @@
+//===- OpenACCSupport.cpp - OpenACCSupport Implementation -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenACCSupport analysis interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtils.h"
+
+namespace mlir {
+namespace acc {
+
+std::string OpenACCSupport::getVariableName(Value v) {
+  if (!impl) // No implementation set: use the default acc utility.
+    return acc::getVariableName(v);
+  return impl->getVariableName(v);
+}
+
+} // namespace acc
+} // namespace mlir
diff --git a/mlir/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/CMakeLists.txt
index 7117520..e8a916e 100644
--- a/mlir/lib/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(Analysis)
 add_subdirectory(IR)
 add_subdirectory(Utils)
 add_subdirectory(Transforms)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 1223325..89adda82 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/OpenACC/OpenACCUtils.h"
 
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) {
@@ -78,3 +79,30 @@ mlir::acc::VariableTypeCategory mlir::acc::getTypeCategory(mlir::Value var) {
         pointerLikeTy.getElementType());
   return typeCategory;
 }
+
+std::string mlir::acc::getVariableName(mlir::Value v) {
+  // Follow the chain of defining ops, stepping through view-like ops to
+  // their sources, until some op yields a name or the chain ends.
+  for (Operation *defOp = v.getDefiningOp(); defOp;) {
+    // An explicit `acc.var_name` attribute has the highest priority.
+    auto nameAttr = defOp->getAttrOfType<VarNameAttr>(getVarNameAttrName());
+    if (nameAttr)
+      return nameAttr.getName().str();
+
+    // Data entry ops may carry a name retrievable through getVarName.
+    if (isa<ACC_DATA_ENTRY_OPS>(defOp)) {
+      auto entryName = acc::getVarName(defOp);
+      if (entryName)
+        return entryName->str();
+    }
+
+    // Anything other than a view-like op terminates the search.
+    auto view = dyn_cast<ViewLikeOpInterface>(defOp);
+    if (!view)
+      break;
+    defOp = view.getViewSource().getDefiningOp();
+  }
+
+  // No name was found along the chain.
+  return "";
+}
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 57a6d34..f3c02da 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(Transforms)
+
 add_mlir_dialect_library(MLIROpenMPDialect
   IR/OpenMPDialect.cpp
 
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
new file mode 100644
index 0000000..b9b8eda
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_mlir_dialect_library(MLIROpenMPTransforms
+  OpenMPOffloadPrivatizationPrepare.cpp
+
+  DEPENDS
+  MLIROpenMPPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRFuncDialect
+  MLIRLLVMDialect
+  MLIROpenMPDialect
+  MLIRPass
+  MLIRTransforms
+  )
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
new file mode 100644
index 0000000..db54eaa
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/OpenMPOffloadPrivatizationPrepare.cpp
@@ -0,0 +1,442 @@
+//===- OpenMPOffloadPrivatizationPrepare.cpp - Prepare OMP privatization --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+//===----------------------------------------------------------------------===//
+// A pass that prepares OpenMP code for translation of delayed privatization
+// in the context of deferred target tasks. Deferred target tasks are created
+// when the nowait clause is used on the target directive.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "omp-prepare-for-offload-privatization"
+
+namespace mlir {
+namespace omp {
+
+#define GEN_PASS_DEF_PREPAREFOROMPOFFLOADPRIVATIZATIONPASS
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+
+} // namespace omp
+} // namespace mlir
+
+using namespace mlir;
+namespace {
+
+//===----------------------------------------------------------------------===//
+// PrepareForOMPOffloadPrivatizationPass
+//===----------------------------------------------------------------------===//
+
+class PrepareForOMPOffloadPrivatizationPass
+    : public omp::impl::PrepareForOMPOffloadPrivatizationPassBase<
+          PrepareForOMPOffloadPrivatizationPass> {
+
+  void runOnOperation() override {
+    ModuleOp mod = getOperation();
+
+    // In this pass, we make host-allocated privatized variables persist for
+    // deferred target tasks by copying them to the heap. Once the target task
+    // is done, this heap memory is freed. Since all of this happens on the host
+    // we can skip device modules.
+    auto offloadModuleInterface =
+        dyn_cast<omp::OffloadModuleInterface>(mod.getOperation());
+    if (offloadModuleInterface && offloadModuleInterface.getIsTargetDevice())
+      return;
+
+    getOperation()->walk([&](omp::TargetOp targetOp) {
+      if (!hasPrivateVars(targetOp) || !isTargetTaskDeferred(targetOp))
+        return;
+      IRRewriter rewriter(&getContext());
+      OperandRange privateVars = targetOp.getPrivateVars();
+      SmallVector<mlir::Value> newPrivVars;
+      Value fakeDependVar;
+      omp::TaskOp cleanupTaskOp;
+
+      newPrivVars.reserve(privateVars.size());
+      std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
+      for (auto [privVarIdx, privVarSymPair] :
+           llvm::enumerate(llvm::zip_equal(privateVars, *privateSyms))) {
+        Value privVar = std::get<0>(privVarSymPair);
+        Attribute privSym = std::get<1>(privVarSymPair);
+
+        omp::PrivateClauseOp privatizer = findPrivatizer(targetOp, privSym);
+        if (!privatizer.needsMap()) {
+          newPrivVars.push_back(privVar);
+          continue;
+        }
+        bool isFirstPrivate = privatizer.getDataSharingType() ==
+                              omp::DataSharingClauseType::FirstPrivate;
+
+        Value mappedValue = targetOp.getMappedValueForPrivateVar(privVarIdx);
+        auto mapInfoOp = cast<omp::MapInfoOp>(mappedValue.getDefiningOp());
+
+        if (mapInfoOp.getMapCaptureType() == omp::VariableCaptureKind::ByCopy) {
+          newPrivVars.push_back(privVar);
+          continue;
+        }
+
+        // For deferred target tasks (!$omp target nowait), we need to keep
+        // a copy of the original, i.e. host variable being privatized so
+        // that it is available when the target task is eventually executed.
+        // We do this by first allocating as much heap memory as is needed by
+        // the original variable. Then, we use the init and copy regions of the
+        // privatizer, an instance of omp::PrivateClauseOp to set up the heap-
+        // allocated copy.
+        // After the target task is done, we need to use the dealloc region
+        // of the privatizer to clean up everything. We also need to free
+        // the heap memory we allocated. But due to the deferred nature
+        // of the target task, we cannot simply deallocate right after the
+        // omp.target operation else we may end up freeing memory before
+        // its eventual use by the target task. So, we create a dummy
+        // dependence between the target task and new omp.task. In the omp.task,
+        // we do all the cleanup. So, we end up with the following structure
+        //
+        // omp.target map_entries(..) ... nowait depend(out:fakeDependVar) {
+        //   ...
+        //   omp.terminator
+        // }
+        // omp.task depend(in: fakeDependVar) {
+        //   /*cleanup_code*/
+        //   omp.terminator
+        // }
+        // fakeDependVar is the address of the first heap-allocated copy of the
+        // host variable being privatized.
+
+        bool needsCleanupTask = !privatizer.getDeallocRegion().empty();
+
+        // Allocate heap memory that corresponds to the type of memory
+        // pointed to by varPtr
+        // For boxchars this won't be a pointer. But, MapsForPrivatizedSymbols
+        // should have mapped the pointer to the boxchar so use that as varPtr.
+        Value varPtr = mapInfoOp.getVarPtr();
+        Type varType = mapInfoOp.getVarType();
+        bool isPrivatizedByValue =
+            !isa<LLVM::LLVMPointerType>(privVar.getType());
+
+        assert(isa<LLVM::LLVMPointerType>(varPtr.getType()));
+        Value heapMem =
+            allocateHeapMem(targetOp, varPtr, varType, mod, rewriter);
+        if (!heapMem) {
+          targetOp.emitError("Unable to allocate heap memory when trying to "
+                             "move a private variable out of the stack and "
+                             "into the heap for use by a deferred target task");
+          return signalPassFailure(); // A null heapMem cannot be used below.
+        }
+        if (needsCleanupTask && !fakeDependVar)
+          fakeDependVar = heapMem;
+
+        // The types of private vars should match before and after the
+        // transformation. In particular, if the type is a pointer,
+        // simply record the newly allocated malloc location as the
+        // new private variable. If, however, the type is not a pointer
+        // then, we need to load the value from the newly allocated
+        // location. We'll insert that load later after we have updated
+        // the malloc'd location with the contents of the original
+        // variable.
+        if (!isPrivatizedByValue)
+          newPrivVars.push_back(heapMem);
+
+        // We now need to copy the original private variable into the newly
+        // allocated location in the heap.
+        // Find the earliest insertion point for the copy. This will be before
+        // the first in the list of omp::MapInfoOp instances that use varPtr.
+        // After the copy these omp::MapInfoOp instances will refer to heapMem
+        // instead.
+        Operation *varPtrDefiningOp = varPtr.getDefiningOp();
+        DenseSet<Operation *> users;
+        if (varPtrDefiningOp) {
+          users.insert(varPtrDefiningOp->user_begin(),
+                       varPtrDefiningOp->user_end());
+        } else {
+          auto blockArg = cast<BlockArgument>(varPtr);
+          users.insert(blockArg.user_begin(), blockArg.user_end());
+        }
+        auto usesVarPtr = [&users](Operation *op) -> bool {
+          return users.count(op);
+        };
+
+        SmallVector<Operation *> chainOfOps;
+        chainOfOps.push_back(mapInfoOp);
+        for (auto member : mapInfoOp.getMembers()) {
+          omp::MapInfoOp memberMap =
+              cast<omp::MapInfoOp>(member.getDefiningOp());
+          if (usesVarPtr(memberMap))
+            chainOfOps.push_back(memberMap);
+          if (memberMap.getVarPtrPtr()) {
+            Operation *defOp = memberMap.getVarPtrPtr().getDefiningOp();
+            if (defOp && usesVarPtr(defOp))
+              chainOfOps.push_back(defOp);
+          }
+        }
+
+        DominanceInfo dom;
+        // llvm::sort needs a strict ordering: an op never precedes itself.
+        llvm::sort(chainOfOps, [&](Operation *l, Operation *r) {
+          return l != r && dom.properlyDominates(l, r);
+        });
+        rewriter.setInsertionPoint(chainOfOps.front());
+
+        Operation *firstOp = chainOfOps.front();
+        Location loc = firstOp->getLoc();
+
+        // Create a llvm.func for 'region' that is marked always_inline, call
+        // it with 'args' at 'loc', and return the result of that call.
+        auto createAlwaysInlineFuncAndCallIt =
+            [&](Region &region, llvm::StringRef funcName,
+                llvm::ArrayRef<Value> args, bool returnsValue) -> Value {
+          assert(!region.empty() && "region cannot be empty");
+          LLVM::LLVMFuncOp func = createFuncOpForRegion(
+              loc, mod, region, funcName, rewriter, returnsValue);
+          auto call = LLVM::CallOp::create(rewriter, loc, func, args);
+          return call.getResult();
+        };
+
+        // By-value privatization passes loaded values to the privatizer
+        // regions; otherwise the pointers themselves are passed.
+        Value moldArg = varPtr;
+        Value newArg = heapMem;
+        if (isPrivatizedByValue) {
+          moldArg = LLVM::LoadOp::create(rewriter, loc, varType, varPtr);
+          newArg = LLVM::LoadOp::create(rewriter, loc, varType, heapMem);
+        }
+
+        Value initializedVal;
+        if (!privatizer.getInitRegion().empty())
+          initializedVal = createAlwaysInlineFuncAndCallIt(
+              privatizer.getInitRegion(),
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "init").str(),
+              {moldArg, newArg}, /*returnsValue=*/true);
+        else
+          initializedVal = newArg;
+
+        if (isFirstPrivate && !privatizer.getCopyRegion().empty())
+          initializedVal = createAlwaysInlineFuncAndCallIt(
+              privatizer.getCopyRegion(),
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "copy").str(),
+              {moldArg, initializedVal}, /*returnsValue=*/true);
+
+        if (isPrivatizedByValue)
+          (void)rewriter.create<LLVM::StoreOp>(loc, initializedVal, heapMem);
+
+        // clone origOp, replace all uses of varPtr with heapMem and
+        // erase origOp.
+        auto cloneModifyAndErase = [&](Operation *origOp) -> Operation * {
+          Operation *clonedOp = rewriter.clone(*origOp);
+          rewriter.replaceAllOpUsesWith(origOp, clonedOp);
+          rewriter.modifyOpInPlace(clonedOp, [&]() {
+            clonedOp->replaceUsesOfWith(varPtr, heapMem);
+          });
+          rewriter.eraseOp(origOp);
+          return clonedOp;
+        };
+
+        // Now that we have set up the heap-allocated copy of the private
+        // variable, rewrite all the uses of the original variable with
+        // the heap-allocated variable.
+        rewriter.setInsertionPoint(targetOp);
+        rewriter.setInsertionPoint(cloneModifyAndErase(mapInfoOp));
+
+        // Fix any members that may use varPtr to now use heapMem
+        for (auto member : mapInfoOp.getMembers()) {
+          auto memberMapInfoOp = cast<omp::MapInfoOp>(member.getDefiningOp());
+          if (!usesVarPtr(memberMapInfoOp))
+            continue;
+          rewriter.setInsertionPoint(cloneModifyAndErase(memberMapInfoOp));
+
+          if (memberMapInfoOp.getVarPtrPtr()) {
+            Operation *varPtrPtrdefOp =
+                memberMapInfoOp.getVarPtrPtr().getDefiningOp();
+            rewriter.setInsertionPoint(cloneModifyAndErase(varPtrPtrdefOp));
+          }
+        }
+
+        // If the type of the private variable is not a pointer,
+        // which is typically the case with !fir.boxchar types, then
+        // we need to ensure that the new private variable is also
+        // not a pointer. Insert a load from heapMem right before
+        // targetOp.
+        if (isPrivatizedByValue) {
+          rewriter.setInsertionPoint(targetOp);
+          auto newPrivVar = rewriter.create<LLVM::LoadOp>(mapInfoOp.getLoc(),
+                                                          varType, heapMem);
+          newPrivVars.push_back(newPrivVar);
+        }
+
+        // Deallocate
+        if (needsCleanupTask) {
+          if (!cleanupTaskOp) {
+            assert(fakeDependVar &&
+                   "Need a valid value to set up a dependency");
+            rewriter.setInsertionPointAfter(targetOp);
+            omp::TaskOperands taskOperands;
+            auto inDepend = omp::ClauseTaskDependAttr::get(
+                rewriter.getContext(), omp::ClauseTaskDepend::taskdependin);
+            taskOperands.dependKinds.push_back(inDepend);
+            taskOperands.dependVars.push_back(fakeDependVar);
+            cleanupTaskOp = omp::TaskOp::create(rewriter, loc, taskOperands);
+            Block *taskBlock = rewriter.createBlock(&cleanupTaskOp.getRegion());
+            rewriter.setInsertionPointToEnd(taskBlock);
+            rewriter.create<omp::TerminatorOp>(cleanupTaskOp.getLoc());
+          }
+          rewriter.setInsertionPointToStart(
+              &*cleanupTaskOp.getRegion().getBlocks().begin());
+          (void)createAlwaysInlineFuncAndCallIt(
+              privatizer.getDeallocRegion(),
+              llvm::formatv("{0}_{1}", privatizer.getSymName(), "dealloc")
+                  .str(),
+              {initializedVal}, /*returnsValue=*/false);
+          llvm::FailureOr<LLVM::LLVMFuncOp> freeFunc =
+              LLVM::lookupOrCreateFreeFn(rewriter, mod);
+          assert(llvm::succeeded(freeFunc) &&
+                 "Could not find free in the module");
+          (void)rewriter.create<LLVM::CallOp>(loc, freeFunc.value(),
+                                              ValueRange{heapMem});
+        }
+      }
+      assert(newPrivVars.size() == privateVars.size() &&
+             "The number of private variables must match before and after "
+             "transformation");
+      if (fakeDependVar) {
+        omp::ClauseTaskDependAttr outDepend = omp::ClauseTaskDependAttr::get(
+            rewriter.getContext(), omp::ClauseTaskDepend::taskdependout);
+        SmallVector<Attribute> newDependKinds;
+        if (!targetOp.getDependVars().empty()) {
+          std::optional<ArrayAttr> dependKinds = targetOp.getDependKinds();
+          assert(dependKinds && "bad depend clause in omp::TargetOp");
+          llvm::copy(*dependKinds, std::back_inserter(newDependKinds));
+        }
+        newDependKinds.push_back(outDepend);
+        ArrayAttr newDependKindsAttr =
+            ArrayAttr::get(rewriter.getContext(), newDependKinds);
+        targetOp.getDependVarsMutable().append(fakeDependVar);
+        targetOp.setDependKindsAttr(newDependKindsAttr);
+      }
+      rewriter.setInsertionPoint(targetOp);
+      targetOp.getPrivateVarsMutable().clear();
+      targetOp.getPrivateVarsMutable().assign(newPrivVars);
+    });
+  }
+
+private:
+  bool hasPrivateVars(omp::TargetOp targetOp) const {
+    return !targetOp.getPrivateVars().empty(); // any private operands?
+  }
+
+  bool isTargetTaskDeferred(omp::TargetOp targetOp) const {
+    return targetOp.getNowait(); // deferred iff the op has `nowait`
+  }
+
+  // Resolve privSym, which must be a SymbolRefAttr, to the omp.private op it
+  // names via SymbolTable::lookupNearestSymbolFrom starting at op.
+  template <typename OpTy>
+  omp::PrivateClauseOp findPrivatizer(OpTy op, Attribute privSym) const {
+    auto privatizerRef = llvm::cast<SymbolRefAttr>(privSym);
+    return SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+        op, privatizerRef);
+  }
+
+  // Get the (compile-time constant) size of varType as per the given
+  // DataLayout dl, rounded up to a multiple of varType's ABI alignment.
+  std::int64_t getSizeInBytes(const DataLayout &dl, Type varType) const {
+    llvm::TypeSize size = dl.getTypeSize(varType);
+    std::uint64_t alignment = dl.getTypeABIAlignment(varType); // no narrowing
+    return llvm::alignTo(size, alignment);
+  }
+
+  // Return the module's malloc(i64), declaring it first if it is missing.
+  LLVM::LLVMFuncOp getMalloc(ModuleOp mod, IRRewriter &rewriter) const {
+    llvm::FailureOr<LLVM::LLVMFuncOp> mallocFn =
+        LLVM::lookupOrCreateMallocFn(rewriter, mod, rewriter.getI64Type());
+    assert(llvm::succeeded(mallocFn) && "Could not find malloc in the module");
+    return mallocFn.value();
+  }
+
+  // Emit a malloc call sized by getSizeInBytes(varType) and return its result.
+  // The call is placed right before the op defining privVar or, if privVar is
+  // a block argument, at the start of its block; the rewriter's insertion
+  // point is restored on return. targetOp is currently unused.
+  Value allocateHeapMem(omp::TargetOp targetOp, Value privVar, Type varType,
+                        ModuleOp mod, IRRewriter &rewriter) const {
+    OpBuilder::InsertionGuard guard(rewriter);
+    if (Operation *definingOp = privVar.getDefiningOp()) {
+      rewriter.setInsertionPoint(definingOp);
+    } else {
+      // A Value with no defining op must be a block argument; cast<> asserts
+      // that invariant instead of silently yielding a null handle.
+      auto blockArg = mlir::cast<BlockArgument>(privVar);
+      rewriter.setInsertionPointToStart(blockArg.getParentBlock());
+    }
+    Location loc = privVar.getLoc();
+    LLVM::LLVMFuncOp mallocFn = getMalloc(mod, rewriter);
+
+    assert(mod.getDataLayoutSpec() &&
+           "MLIR module with no datalayout spec not handled yet");
+    DataLayout dl(mod);
+    std::int64_t sizeInBytes = getSizeInBytes(dl, varType);
+    Value sizeBytes = rewriter.create<LLVM::ConstantOp>(
+        loc, mallocFn.getFunctionType().getParamType(0), sizeInBytes);
+    auto mallocCallOp =
+        rewriter.create<LLVM::CallOp>(loc, mallocFn, ValueRange{sizeBytes});
+    return mallocCallOp.getResult();
+  }
+
+  // Outline srcRegion into a new always_inline llvm.func named funcName that
+  // is appended to the end of mod, and return that function. srcRegion is
+  // assumed to be the init, copy or dealloc region of an omp::PrivateClauseOp:
+  // the result type is the type of the region's first argument when
+  // returnsValue is set (init and copy) and void otherwise (dealloc).
+  // NOTE(review): a new function is created on every call; confirm callers
+  // never request the same funcName twice within one module.
+  LLVM::LLVMFuncOp createFuncOpForRegion(Location loc, ModuleOp mod,
+                                         Region &srcRegion,
+                                         llvm::StringRef funcName,
+                                         IRRewriter &rewriter,
+                                         bool returnsValue = false) {
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(mod.getBody(), mod.getBody()->end());
+
+    auto argTypes = srcRegion.getArgumentTypes();
+    SmallVector<Type> paramTypes(argTypes.begin(), argTypes.end());
+    Type resultType = LLVM::LLVMVoidType::get(rewriter.getContext());
+    if (returnsValue)
+      resultType = srcRegion.getArgument(0).getType();
+    auto funcType = LLVM::LLVMFunctionType::get(resultType, paramTypes);
+    auto func = LLVM::LLVMFuncOp::create(rewriter, loc, funcName, funcType);
+    func.setAlwaysInline(true);
+
+    // Move a clone into the function so that srcRegion is left untouched.
+    Region bodyCopy;
+    IRMapping mapping;
+    srcRegion.cloneInto(&bodyCopy, mapping);
+    Region &funcBody = func.getRegion();
+    rewriter.inlineRegionBefore(bodyCopy, funcBody, funcBody.end());
+    for (Block &block : funcBody) {
+      auto yieldOp = mlir::dyn_cast<omp::YieldOp>(block.getTerminator());
+      if (!yieldOp)
+        continue;
+      rewriter.setInsertionPoint(yieldOp);
+      rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(yieldOp, TypeRange(),
+                                                  yieldOp.getOperands());
+    }
+    return func;
+  }
+};
+} // namespace
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index a9da6c2..9bd13f3 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -2490,8 +2490,8 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> {
         changed = true;
 
         if (!constantTrue)
-          constantTrue = rewriter.create<arith::ConstantOp>(
-              op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 1));
+          constantTrue = arith::ConstantOp::create(
+              rewriter, op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 1));
 
         rewriter.modifyOpInPlace(use.getOwner(),
                                  [&]() { use.set(constantTrue); });
@@ -2500,8 +2500,8 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> {
         changed = true;
 
         if (!constantFalse)
-          constantFalse = rewriter.create<arith::ConstantOp>(
-              op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 0));
+          constantFalse = arith::ConstantOp::create(
+              rewriter, op.getLoc(), i1Ty, rewriter.getIntegerAttr(i1Ty, 0));
 
         rewriter.modifyOpInPlace(use.getOwner(),
                                  [&]() { use.set(constantFalse); });
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index caf8016..99b7cda 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -1001,8 +1001,12 @@ OpFoldResult ArgMaxOp::fold(FoldAdaptor adaptor) {
       !outputTy.hasStaticShape())
     return {};
 
-  if (inputTy.getDimSize(getAxis()) == 1)
-    return DenseElementsAttr::get(outputTy, 0);
+  const Type outputElementTy = getElementTypeOrSelf(outputTy);
+  if (inputTy.getDimSize(getAxis()) == 1 && outputElementTy.isInteger()) {
+    const auto outputElemIntTy = cast<IntegerType>(outputElementTy);
+    const APInt zero = APInt::getZero(outputElemIntTy.getWidth());
+    return DenseElementsAttr::get(outputTy, zero);
+  }
 
   return {};
 }
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp
index 8f46ad6..ef49c86 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShuffle.cpp
@@ -74,9 +74,9 @@ struct MixedSizeInputShuffleOpRewrite final
     for (int64_t i = 0; i < origNumElems; ++i)
       promoteMask[i] = i;
 
-    Value promotedInput = rewriter.create<vector::ShuffleOp>(
-        shuffleOp.getLoc(), promotedType, inputToPromote, inputToPromote,
-        promoteMask);
+    Value promotedInput =
+        vector::ShuffleOp::create(rewriter, shuffleOp.getLoc(), promotedType,
+                                  inputToPromote, inputToPromote, promoteMask);
 
     // Create the final shuffle with the promoted inputs.
     Value promotedV1 = promoteV1 ? promotedInput : shuffleOp.getV1();
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 7c019e7..8b5e950 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -341,13 +341,18 @@ private:
 /// Return the distributed vector type based on the original type and the
 /// distribution map. The map is expected to have a dimension equal to the
 /// original type rank and should be a projection where the results are the
-/// distributed dimensions. The number of results should be equal to the number
+/// distributed dimensions. If the number of results is zero there is no
+/// distribution (i.e. original type is returned).
+/// Otherwise, The number of results should be equal to the number
 /// of warp sizes which is currently limited to 1.
 /// Example: For a vector<16x32x64> distributed with a map(d0, d1, d2) -> (d1)
 /// and a warp size of 16 would distribute the second dimension (associated to
 /// d1) and return vector<16x2x64>
 static VectorType getDistributedType(VectorType originalType, AffineMap map,
                                      int64_t warpSize) {
+  // If the map has zero results, return the original type.
+  if (map.getNumResults() == 0)
+    return originalType;
   SmallVector<int64_t> targetShape(originalType.getShape());
   for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
     unsigned position = map.getDimPosition(i);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 1599ae9..24e9095 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -736,7 +736,7 @@ OpFoldResult genBinOp(OpFoldResult a, OpFoldResult b, Location loc,
                       OpBuilder &builder) {
   auto aVal = getValueOrCreateConstantIndexOp(builder, loc, a);
   auto bVal = getValueOrCreateConstantIndexOp(builder, loc, b);
-  return builder.create<ArithOp>(loc, aVal, bVal).getResult();
+  return ArithOp::create(builder, loc, aVal, bVal).getResult();
 }
 
 // a helper utility to perform division operation on OpFoldResult and int64_t.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 26770b3..d09dc19 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1505,14 +1505,19 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
       return AffineMap::get(val.getContext());
     // Get the layout of the vector type.
     xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
-    // If no layout is specified, assume the inner most dimension is distributed
-    // for now.
+    // If no layout is specified, that means no distribution.
     if (!layout)
-      return AffineMap::getMultiDimMapWithTargets(
-          vecRank, {static_cast<unsigned int>(vecRank - 1)}, val.getContext());
+      return AffineMap::getMultiDimMapWithTargets(vecRank, {},
+                                                  val.getContext());
+    // Expecting vector and layout rank to match.
+    assert(layout.getRank() == vecRank &&
+           "Expecting vector and layout rank to match");
+    // A dimension is distributed only if layout suggests there are
+    // multiple lanes assigned for this dimension and the shape can be evenly
+    // distributed to those lanes.
     SmallVector<unsigned int> distributedDims;
     for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
-      if (v > 1)
+      if (v > 1 && vecType.getShape()[i] % v == 0)
         distributedDims.push_back(i);
     }
     return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
@@ -1525,15 +1530,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
                           vector::CombiningKind kind, uint32_t size) {
     // First reduce on a single thread to get per lane reduction value.
-    Value laneVal = builder.create<vector::ReductionOp>(loc, kind, input);
+    Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
     // Parallel reduction using butterfly shuffles.
     for (uint64_t i = 1; i < size; i <<= 1) {
-      Value shuffled =
-          builder
-              .create<gpu::ShuffleOp>(loc, laneVal, i,
-                                      /*width=*/size,
-                                      /*mode=*/gpu::ShuffleMode::XOR)
-              .getShuffleResult();
+      Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
+                                              /*width=*/size,
+                                              /*mode=*/gpu::ShuffleMode::XOR)
+                           .getShuffleResult();
       laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
     }
     return laneVal;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 31a967d..9fc5ad9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -825,7 +825,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
 
       auto tileAttr = DenseElementsAttr::get(VectorType::get(sgShape, eltType),
                                              baseTileValues);
-      auto baseConstVec = rewriter.create<arith::ConstantOp>(loc, tileAttr);
+      auto baseConstVec = arith::ConstantOp::create(rewriter, loc, tileAttr);
 
       // Get subgroup id
       Value sgId =
@@ -837,25 +837,26 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
 
       SmallVector<Value, 2> strideConsts;
       strideConsts.push_back(
-          rewriter.create<arith::ConstantIndexOp>(loc, colStride));
+          arith::ConstantIndexOp::create(rewriter, loc, colStride));
       if (rows > 1)
         strideConsts.insert(
             strideConsts.begin(),
-            rewriter.create<arith::ConstantIndexOp>(loc, rowStride));
+            arith::ConstantIndexOp::create(rewriter, loc, rowStride));
 
       SmallVector<Value> newConstOps;
       for (auto offsets : *sgOffsets) {
         // Multiply offset with stride, broadcast it and add to baseConstVec
-        Value mulOffset = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+        Value mulOffset = arith::ConstantIndexOp::create(rewriter, loc, 0);
         for (size_t i = 0; i < strideConsts.size(); ++i) {
-          Value mul = rewriter.create<arith::MulIOp>(
-              loc, rewriter.getIndexType(), offsets[i], strideConsts[i]);
-          mulOffset = rewriter.create<arith::AddIOp>(
-              loc, rewriter.getIndexType(), mulOffset, mul);
+          Value mul =
+              arith::MulIOp::create(rewriter, loc, rewriter.getIndexType(),
+                                    offsets[i], strideConsts[i]);
+          mulOffset = arith::AddIOp::create(
+              rewriter, loc, rewriter.getIndexType(), mulOffset, mul);
         }
         // Broadcast to baseConstVec size
-        auto bcastOffset = rewriter.create<vector::BroadcastOp>(
-            loc, baseConstVec.getType(), mulOffset);
+        auto bcastOffset = vector::BroadcastOp::create(
+            rewriter, loc, baseConstVec.getType(), mulOffset);
         auto finalConst =
             arith::AddIOp::create(rewriter, loc, baseConstVec, bcastOffset);
         setLayoutIfNeeded(baseConstVec);
@@ -1138,8 +1139,8 @@ struct WgToSgVectorShapeCastOp
 
     SmallVector<Value> newShapeCastOps;
     for (auto src : adaptor.getSource()) {
-      auto newShapeCast =
-          rewriter.create<vector::ShapeCastOp>(op.getLoc(), newResultType, src);
+      auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(),
+                                                      newResultType, src);
       if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
           !layout.getEffectiveInstDataAsInt().empty())
         xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0),
@@ -1201,9 +1202,9 @@ struct WgToSgMultiDimReductionOp
 
     SmallVector<Value> newReductions;
     for (auto sgSrc : adaptor.getSource()) {
-      auto newOp = rewriter.create<vector::MultiDimReductionOp>(
-          op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0],
-          op.getReductionDims());
+      auto newOp = vector::MultiDimReductionOp::create(
+          rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc,
+          adaptor.getAcc()[0], op.getReductionDims());
       if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
           !layout.getEffectiveInstDataAsInt().empty())
         xegpu::setDistributeLayoutAttr(newOp->getResult(0),
diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp
index dd413d2de..d7e321a 100644
--- a/mlir/lib/RegisterAllPasses.cpp
+++ b/mlir/lib/RegisterAllPasses.cpp
@@ -33,6 +33,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/NVGPU/Transforms/Passes.h"
 #include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Dialect/Quant/Transforms/Passes.h"
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 #include "mlir/Dialect/SPIRV/Transforms/Passes.h"
@@ -80,6 +81,7 @@ void mlir::registerAllPasses() {
   memref::registerMemRefPasses();
   shard::registerShardPasses();
   ml_program::registerMLProgramPasses();
+  omp::registerOpenMPPasses();
   quant::registerQuantPasses();
   registerSCFPasses();
   registerShapePasses();
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index b851414..f284540 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -357,14 +357,8 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       result = todo("priority");
   };
   auto checkPrivate = [&todo](auto op, LogicalResult &result) {
-    if constexpr (std::is_same_v<std::decay_t<decltype(op)>, omp::TargetOp>) {
-      // Privatization is supported only for included target tasks.
-      if (!op.getPrivateVars().empty() && op.getNowait())
-        result = todo("privatization for deferred target tasks");
-    } else {
-      if (!op.getPrivateVars().empty() || op.getPrivateSyms())
-        result = todo("privatization");
-    }
+    if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+      result = todo("privatization");
   };
   auto checkReduction = [&todo](auto op, LogicalResult &result) {
     if (isa<omp::TeamsOp>(op))
@@ -451,7 +445,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkDevice(op, result);
         checkInReduction(op, result);
         checkIsDevicePtr(op, result);
-        checkPrivate(op, result);
       })
       .Default([](Operation &) {
         // Assume all clauses for an operation can be translated unless they are
diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index d9ad8fb..6492708 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -702,8 +702,8 @@ spirv::Deserializer::processGraphEntryPointARM(ArrayRef<uint32_t> operands) {
   // RAII guard to reset the insertion point to previous value when done.
   OpBuilder::InsertionGuard insertionGuard(opBuilder);
   opBuilder.setInsertionPoint(graphARM);
-  opBuilder.create<spirv::GraphEntryPointARMOp>(
-      unknownLoc, SymbolRefAttr::get(opBuilder.getContext(), name),
+  spirv::GraphEntryPointARMOp::create(
+      opBuilder, unknownLoc, SymbolRefAttr::get(opBuilder.getContext(), name),
       opBuilder.getArrayAttr(interface));
 
   return success();
@@ -736,7 +736,7 @@ spirv::Deserializer::processGraphARM(ArrayRef<uint32_t> operands) {
 
   std::string graphName = getGraphSymbol(graphID);
   auto graphOp =
-      opBuilder.create<spirv::GraphARMOp>(unknownLoc, graphName, graphType);
+      spirv::GraphARMOp::create(opBuilder, unknownLoc, graphName, graphType);
   curGraph = graphMap[graphID] = graphOp;
   Block *entryBlock = graphOp.addEntryBlock();
   LLVM_DEBUG({
@@ -844,7 +844,7 @@ spirv::Deserializer::processOpGraphSetOutputARM(ArrayRef<uint32_t> operands) {
 LogicalResult
 spirv::Deserializer::processGraphEndARM(ArrayRef<uint32_t> operands) {
   // Create GraphOutputsARM instruction.
-  opBuilder.create<spirv::GraphOutputsARMOp>(unknownLoc, graphOutputs);
+  spirv::GraphOutputsARMOp::create(opBuilder, unknownLoc, graphOutputs);
 
   // Process OpGraphEndARM.
   if (!operands.empty()) {
diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
index 366ba8f..048e964 100644
--- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
+++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp
@@ -406,7 +406,7 @@ private:
     auto returnOperands = popOperands(resTypes);
     if (failed(returnOperands))
       return failure();
-    builder.create<BlockReturnOp>(opLoc, *returnOperands);
+    BlockReturnOp::create(builder, opLoc, *returnOperands);
     LDBG() << "end of parsing of a block";
     return bodyParsingRes->endingByte;
   }
@@ -1000,7 +1000,7 @@ parsed_inst_t ExpressionParser::parseBlockLikeOp(OpBuilder &builder) {
       builder.createBlock(curRegion, curRegion->end(), resTypes, locations);
   builder.setInsertionPointToEnd(curBlock);
   auto blockOp =
-      builder.create<OpToCreate>(*currentOpLoc, *inputOps, successor);
+      OpToCreate::create(builder, *currentOpLoc, *inputOps, successor);
   auto *blockBody = blockOp.createBlock();
   if (failed(parseBlockContent(builder, blockBody, resTypes, *opLoc, blockOp)))
     return failure();
@@ -1047,8 +1047,8 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
   auto *successor =
       builder.createBlock(curRegion, curRegion->end(), resTypes, locations);
   builder.setInsertionPointToEnd(curBlock);
-  auto ifOp = builder.create<IfOp>(*currentOpLoc, conditionValue->front(),
-                                   *inputOps, successor);
+  auto ifOp = IfOp::create(builder, *currentOpLoc, conditionValue->front(),
+                           *inputOps, successor);
   auto *ifEntryBlock = ifOp.createIfBlock();
   constexpr auto ifElseFilter =
       ByteSequence<WasmBinaryEncoding::endByte,
@@ -1091,9 +1091,9 @@ inline parsed_inst_t ExpressionParser::parseSpecificInstruction<
   auto branchArgs = popOperands(inputTypes);
   if (failed(branchArgs))
     return failure();
-  builder.create<BranchIfOp>(*currentOpLoc, condition->front(),
-                             builder.getUI32IntegerAttr(*level), *branchArgs,
-                             elseBlock);
+  BranchIfOp::create(builder, *currentOpLoc, condition->front(),
+                     builder.getUI32IntegerAttr(*level), *branchArgs,
+                     elseBlock);
   builder.setInsertionPointToStart(elseBlock);
   return {*branchArgs};
 }
@@ -1115,7 +1115,7 @@ ExpressionParser::parseSpecificInstruction<WasmBinaryEncoding::OpCode::call>(
   if (failed(inOperands))
     return failure();
   auto callOp =
-      builder.create<FuncCallOp>(loc, resTypes, callee.symbol, *inOperands);
+      FuncCallOp::create(builder, loc, resTypes, callee.symbol, *inOperands);
   return {callOp.getResults()};
 }
 
@@ -1391,8 +1391,8 @@ inline parsed_inst_t ExpressionParser::buildConvertOp(OpBuilder &builder,
   auto operand = popOperands(intype);
   if (failed(operand))
     return failure();
-  auto op = builder.create<opType>(*currentOpLoc, outType, operand->front(),
-                                   extraArgs...);
+  auto op = opType::create(builder, *currentOpLoc, outType, operand->front(),
+                           extraArgs...);
   LDBG() << "Built operation: " << op;
   return {{op.getResult()}};
 }
diff --git a/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir
new file mode 100644
index 0000000..af52bef
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/support-analysis-varname.mlir
@@ -0,0 +1,88 @@
+// RUN: mlir-opt %s -split-input-file -test-acc-support | FileCheck %s
+
+// Test with a direct variable name: the operand's defining op carries it
+func.func @test_direct_var_name() {
+  // Create a memref with acc.var_name attribute
+  %0 = memref.alloca() {acc.var_name = #acc.var_name<"my_variable">} : memref<10xi32>
+  // Mark with test attribute - should find the name on the alloca above
+  %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32>
+  // CHECK-NEXT: getVariableName="my_variable"
+
+  return
+}
+
+// -----
+
+// Test that the name is found through an intermediate memref.cast
+func.func @test_through_cast() {
+  // Create a 5x2 memref with acc.var_name attribute
+  %0 = memref.alloca() {acc.var_name = #acc.var_name<"casted_variable">} : memref<5x2xi32>
+
+  // Cast to dynamic dimensions (this cast carries no name of its own)
+  %1 = memref.cast %0 : memref<5x2xi32> to memref<?x?xi32>
+
+  // Mark with test attribute - should find name through the cast above
+  %2 = memref.cast %1 {test.var_name} : memref<?x?xi32> to memref<5x2xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<?x?xi32> to memref<5x2xi32>
+  // CHECK-NEXT: getVariableName="casted_variable"
+
+  return
+}
+
+// -----
+
+// Test with no variable name on any op: an empty name is expected
+func.func @test_no_var_name() {
+  // Create a memref without acc.var_name attribute
+  %0 = memref.alloca() : memref<10xi32>
+
+  // Mark with test attribute - should find empty string
+  %1 = memref.cast %0 {test.var_name} : memref<10xi32> to memref<10xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32>
+  // CHECK-NEXT: getVariableName=""
+
+  return
+}
+
+// -----
+
+// Test that the name is found through a chain of several memref.cast ops
+func.func @test_multiple_casts() {
+  // Create a memref with acc.var_name attribute
+  %0 = memref.alloca() {acc.var_name = #acc.var_name<"multi_cast">} : memref<10xi32>
+
+  // Two unnamed casts: static -> dynamic -> static shape
+  %1 = memref.cast %0 : memref<10xi32> to memref<?xi32>
+  %2 = memref.cast %1 : memref<?xi32> to memref<10xi32>
+
+  // Mark with test attribute - should find name through multiple casts
+  %3 = memref.cast %2 {test.var_name} : memref<10xi32> to memref<10xi32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xi32> to memref<10xi32>
+  // CHECK-NEXT: getVariableName="multi_cast"
+
+  return
+}
+
+// -----
+
+// Test that the name is taken from an acc.copyin operation's name attribute
+func.func @test_copyin_name() {
+  // Create a memref (it has no acc.var_name attribute)
+  %0 = memref.alloca() : memref<10xf32>
+
+  // Create an acc.copyin operation with a name
+  %1 = acc.copyin varPtr(%0 : memref<10xf32>) -> memref<10xf32> {name = "input_data"}
+
+  // Mark with test attribute - should find name from copyin operation
+  %2 = memref.cast %1 {test.var_name} : memref<10xf32> to memref<?xf32>
+
+  // CHECK: op=%{{.*}} = memref.cast %{{.*}} {test.var_name} : memref<10xf32> to memref<?xf32>
+  // CHECK-NEXT: getVariableName="input_data"
+
+  return
+}
diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir
new file mode 100644
index 0000000..8972a08
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare-by-value.mlir
@@ -0,0 +1,157 @@
+// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+
+  omp.private {type = firstprivate} @private_eye : i32 copy { // copy: load the i32 from %arg0 and store it to %arg1
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.load %arg0 : !llvm.ptr -> i32
+    llvm.store %0, %arg1 : i32, !llvm.ptr
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+  omp.private {type = firstprivate} @boxchar_firstprivate : !llvm.struct<(ptr, i64)> init { // init: malloc as many bytes as %arg0's i64 field and yield a struct of {new buffer, same i64}
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>):
+    %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    %8 = llvm.call @malloc(%1) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr
+    %9 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+    %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, i64)>
+    %11 = llvm.insertvalue %1, %10[1] : !llvm.struct<(ptr, i64)>
+    omp.yield(%11 : !llvm.struct<(ptr, i64)>)
+  } copy { // copy: memmove min(%arg1[1], %arg0[1]) bytes from %arg0's pointer to %arg1's pointer
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>, %arg1: !llvm.struct<(ptr, i64)>):
+    %3 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %4 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    %5 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr, i64)>
+    %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr, i64)>
+    %7 = llvm.icmp "slt" %6, %4 : i64
+    %8 = llvm.select %7, %6, %4 : i1, i64
+    "llvm.intr.memmove"(%5, %3, %8) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+    omp.yield(%arg1 : !llvm.struct<(ptr, i64)>)
+  } dealloc { // dealloc: free the pointer field of %arg0
+  ^bb0(%arg0: !llvm.struct<(ptr, i64)>):
+    %0 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr, i64)>
+    %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64)>
+    llvm.call @free(%0) : (!llvm.ptr) -> ()
+    omp.yield
+  }
+
+  llvm.func @target_boxchar_(%arg0: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    %2 = llvm.mlir.constant(1 : i64) : i64
+    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr
+    %4 = llvm.mlir.constant(1 : index) : i64
+    %5 = llvm.mlir.constant(0 : index) : i64
+    %6 = llvm.mlir.constant(0 : i32) : i32
+    %7 = llvm.mlir.constant(1 : i64) : i64
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.load %arg0 : !llvm.ptr -> i32
+    %10 = llvm.icmp "sgt" %9, %6 : i32
+    %11 = llvm.select %10, %9, %6 : i1, i32
+    %12 = llvm.mlir.constant(1 : i64) : i64
+    %13 = llvm.sext %11 : i32 to i64
+    %14 = llvm.alloca %13 x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr
+    %15 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+    %16 = llvm.sext %11 : i32 to i64
+    %17 = llvm.insertvalue %14, %15[0] : !llvm.struct<(ptr, i64)>
+    %18 = llvm.insertvalue %16, %17[1] : !llvm.struct<(ptr, i64)>
+    llvm.store %18, %3 : !llvm.struct<(ptr, i64)>, !llvm.ptr // spill the struct value %18 to the stack slot %3 that the map.info ops below reference
+    %19 = llvm.load %3 : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+    %20 = llvm.extractvalue %19[0] : !llvm.struct<(ptr, i64)>
+    %21 = llvm.extractvalue %19[1] : !llvm.struct<(ptr, i64)>
+    %22 = llvm.sub %21, %4 : i64
+    %23 = omp.map.bounds lower_bound(%5 : i64) upper_bound(%22 : i64) extent(%21 : i64) stride(%4 : i64) start_idx(%5 : i64) {stride_in_bytes = true}
+    %24 = llvm.getelementptr %3[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)>
+    %25 = omp.map.info var_ptr(%3 : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%24 : !llvm.ptr) bounds(%23) -> !llvm.ptr
+    %26 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%25 : [0] : !llvm.ptr) -> !llvm.ptr
+    %27 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+    omp.target nowait map_entries(%26 -> %arg1, %27 -> %arg2, %25 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %18 -> %arg4 [map_idx=0], @private_eye %1 -> %arg5 [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) { // %18 is privatized as a struct value; %1 is passed as a pointer
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+// CHECK-LABEL:   llvm.func @target_boxchar_(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "l"}) attributes {fir.internal_name = "_QPtarget_boxchar", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(16 : i64) : i64
+// CHECK: %[[HEAP0:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_2]] x !llvm.struct<(ptr, i64)> : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_11:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> i32
+// CHECK: %[[VAL_12:.*]] = llvm.icmp "sgt" %[[VAL_11]], %[[VAL_8]] : i32
+// CHECK: %[[VAL_13:.*]] = llvm.select %[[VAL_12]], %[[VAL_11]], %[[VAL_8]] : i1, i32
+// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: %[[VAL_15:.*]] = llvm.sext %[[VAL_13]] : i32 to i64
+// CHECK: %[[VAL_16:.*]] = llvm.alloca %[[VAL_15]] x i8 {bindc_name = "char_var"} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_17:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_18:.*]] = llvm.sext %[[VAL_13]] : i32 to i64
+// CHECK: %[[VAL_19:.*]] = llvm.insertvalue %[[VAL_16]], %[[VAL_17]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_18]], %[[VAL_19]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: llvm.store %[[VAL_20]], %[[VAL_5]] : !llvm.struct<(ptr, i64)>, !llvm.ptr
+// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_22:.*]] = llvm.extractvalue %[[VAL_21]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_23:.*]] = llvm.extractvalue %[[VAL_21]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_24:.*]] = llvm.sub %[[VAL_23]], %[[VAL_6]] : i64
+// CHECK: %[[VAL_25:.*]] = omp.map.bounds lower_bound(%[[VAL_7]] : i64) upper_bound(%[[VAL_24]] : i64) extent(%[[VAL_23]] : i64) stride(%[[VAL_6]] : i64) start_idx(%[[VAL_7]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_5]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_27:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_28:.*]] = llvm.call @boxchar_firstprivate_init(%[[VAL_26]], %[[VAL_27]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_29:.*]] = llvm.call @boxchar_firstprivate_copy(%[[VAL_26]], %[[VAL_28]]) : (!llvm.struct<(ptr, i64)>, !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)>
+// CHECK: llvm.store %[[VAL_29]], %[[HEAP0]] : !llvm.struct<(ptr, i64)>, !llvm.ptr
+// CHECK: %[[VAL_30:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+// CHECK: %[[VAL_31:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_32:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i8) map_clauses(implicit, to) capture(ByRef) var_ptr_ptr(%[[VAL_31]] : !llvm.ptr) bounds(%[[VAL_25]]) -> !llvm.ptr
+// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64)>) map_clauses(to) capture(ByRef) members(%[[VAL_32]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_34:.*]] = llvm.load %[[HEAP0]] : !llvm.ptr -> !llvm.struct<(ptr, i64)>
+// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_33]] -> %[[VAL_35:.*]], %[[VAL_30]] -> %[[VAL_36:.*]], %[[VAL_32]] -> %[[VAL_37:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@boxchar_firstprivate %[[VAL_34]] -> %[[VAL_38:.*]] [map_idx=0], @private_eye %[[VAL_1]] -> %[[VAL_39:.*]] [map_idx=1] : !llvm.struct<(ptr, i64)>, !llvm.ptr) {
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) {
+// CHECK: llvm.call @boxchar_firstprivate_dealloc(%[[VAL_29]]) : (!llvm.struct<(ptr, i64)>) -> ()
+// CHECK: llvm.call @free(%[[HEAP0]]) : (!llvm.ptr) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: llvm.return
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @boxchar_firstprivate_init(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_2:.*]] = llvm.call @malloc(%[[VAL_1]]) {bindc_name = "", uniq_name = ""} : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_4:.*]] = llvm.insertvalue %[[VAL_2]], %[[VAL_3]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_1]], %[[VAL_4]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: llvm.return %[[VAL_5]] : !llvm.struct<(ptr, i64)>
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @boxchar_firstprivate_copy(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[ARG1]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[ARG1]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_4:.*]] = llvm.icmp "slt" %[[VAL_3]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_5:.*]] = llvm.select %[[VAL_4]], %[[VAL_3]], %[[VAL_1]] : i1, i64
+// CHECK: "llvm.intr.memmove"(%[[VAL_2]], %[[VAL_0]], %[[VAL_5]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.struct<(ptr, i64)>
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @boxchar_firstprivate_dealloc(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.struct<(ptr, i64)>) attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.extractvalue %[[ARG0]][0] : !llvm.struct<(ptr, i64)>
+// CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[ARG0]][1] : !llvm.struct<(ptr, i64)>
+// CHECK: llvm.call @free(%[[VAL_0]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
diff --git a/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
new file mode 100644
index 0000000..0377d49
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/omp-offload-privatization-prepare.mlir
@@ -0,0 +1,201 @@
+// RUN: mlir-opt --mlir-disable-threading -omp-offload-privatization-prepare --split-input-file %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.legal_int_widths" = array<i32: 8, 16, 32, 64>, "dlti.stack_alignment" = 128 : i64>} {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+
+  omp.private {type = firstprivate} @firstprivatizer : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { // init: malloc 48 bytes and store the pointer into field 0 of the struct at %arg1
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i64) : i64
+    %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr
+    %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %1, %2 : !llvm.ptr, !llvm.ptr
+    omp.yield(%arg1 : !llvm.ptr)
+  } copy { // copy: memcpy 48 bytes from %arg0 to %arg1
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i32) : i32
+    "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  } dealloc { // dealloc: free %arg0
+  ^bb0(%arg0: !llvm.ptr):
+    llvm.call @free(%arg0) : (!llvm.ptr) -> ()
+    omp.yield
+  }
+  omp.private {type = firstprivate} @firstprivatizer_1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> init { // init: malloc 48 bytes and store the pointer into field 0 of the struct at %arg1
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i64) : i64
+    %1 = llvm.call @malloc(%0) : (i64) -> !llvm.ptr
+    %2 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %1, %2 : !llvm.ptr, !llvm.ptr
+    omp.yield(%arg1 : !llvm.ptr)
+  } copy { // copy: memcpy 48 bytes from %arg0 to %arg1
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(48 : i32) : i32
+    "llvm.intr.memcpy"(%arg1, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  } dealloc { // dealloc: free %arg0
+  ^bb0(%arg0: !llvm.ptr):
+    llvm.call @free(%arg0) : (!llvm.ptr) -> ()
+    omp.yield
+  }
+
+  llvm.func internal @firstprivate_test(%arg0: !llvm.ptr {fir.bindc_name = "ptr0"}, %arg1: !llvm.ptr {fir.bindc_name = "ptr1"}) {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %5 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %19 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+    %20 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr
+    %21 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+    %33 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.store %33, %19 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+    llvm.store %33, %20 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+    llvm.store %0, %21 : i32, !llvm.ptr
+    %124 = omp.map.info var_ptr(%21 : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %150 = llvm.getelementptr %19[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %151 = llvm.load %150 : !llvm.ptr -> i64
+    %152 = llvm.getelementptr %19[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %153 = llvm.load %152 : !llvm.ptr -> i64
+    %154 = llvm.getelementptr %19[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %155 = llvm.load %154 : !llvm.ptr -> i64
+    %156 = llvm.sub %153, %1 : i64
+    %157 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%156 : i64) extent(%153 : i64) stride(%155 : i64) start_idx(%151 : i64) {stride_in_bytes = true}
+    %158 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %159 = omp.map.info var_ptr(%19 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%158 : !llvm.ptr) bounds(%157) -> !llvm.ptr {name = ""}
+    %160 = omp.map.info var_ptr(%19 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%159 : [0] : !llvm.ptr) -> !llvm.ptr
+    %1501 = llvm.getelementptr %20[0, 7, %1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1511 = llvm.load %1501 : !llvm.ptr -> i64
+    %1521 = llvm.getelementptr %20[0, 7, %1, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1531 = llvm.load %1521 : !llvm.ptr -> i64
+    %1541 = llvm.getelementptr %20[0, 7, %1, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1551 = llvm.load %1541 : !llvm.ptr -> i64
+    %1561 = llvm.sub %1531, %1 : i64
+    %1571 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%1561 : i64) extent(%1531 : i64) stride(%1551 : i64) start_idx(%1511 : i64) {stride_in_bytes = true}
+    %1581 = llvm.getelementptr %20[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %1591 = omp.map.info var_ptr(%20 : !llvm.ptr, i32) map_clauses(descriptor_base_addr, to) capture(ByRef) var_ptr_ptr(%1581 : !llvm.ptr) bounds(%1571) -> !llvm.ptr {name = ""}
+    %1601 = omp.map.info var_ptr(%20 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always, descriptor, to) capture(ByRef) members(%1591 : [0] : !llvm.ptr) -> !llvm.ptr
+
+    // Privatize two firstprivate variables in one target region to verify that, even with several
+    // variables to clean up, only a single cleanup omp.task is generated.
+    omp.target nowait map_entries(%124 -> %arg2, %160 -> %arg5, %159 -> %arg8, %1601 -> %arg9, %1591 -> %arg10  : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %19 -> %arg11 [map_idx=1], @firstprivatizer_1 %20 -> %arg12 [map_idx=3] : !llvm.ptr, !llvm.ptr) {
+      omp.terminator
+    }
+    %166 = llvm.mlir.constant(48 : i32) : i32
+    %167 = llvm.getelementptr %19[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %168 = llvm.load %167 : !llvm.ptr -> !llvm.ptr
+    llvm.call @free(%168) : (!llvm.ptr) -> ()
+    llvm.return
+  }
+
+}
+// CHECK-LABEL:   llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
+
+
+// CHECK-LABEL:   llvm.func internal @firstprivate_test(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr {fir.bindc_name = "ptr0"},
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.ptr {fir.bindc_name = "ptr1"}) {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_2:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[HEAP0:.*]] = llvm.call @malloc(%[[VAL_3]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "local"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[HEAP1:.*]] = llvm.call @malloc(%[[VAL_6]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_0]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {bindc_name = "glocal"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_9:.*]] = llvm.alloca %[[VAL_0]] x i32 {bindc_name = "i"} : (i32) -> !llvm.ptr
+// CHECK: %[[VAL_10:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_10]], %[[VAL_5]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_10]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
+// CHECK: llvm.store %[[VAL_0]], %[[VAL_9]] : i32, !llvm.ptr
+// CHECK: %[[VAL_11:.*]] = omp.map.info var_ptr(%[[VAL_9]] : !llvm.ptr, i32) map_clauses(implicit) capture(ByCopy) -> !llvm.ptr {name = "i"}
+// CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_5]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_18:.*]] = llvm.sub %[[VAL_15]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_19:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_18]] : i64) extent(%[[VAL_15]] : i64) stride(%[[VAL_17]] : i64) start_idx(%[[VAL_13]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_20:.*]] = llvm.call @firstprivatizer_init(%[[VAL_5]], %[[HEAP0]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_21:.*]] = llvm.call @firstprivatizer_copy(%[[VAL_5]], %[[VAL_20]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[VAL_8]][0, 7, %[[VAL_1]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> i64
+// CHECK: %[[VAL_28:.*]] = llvm.sub %[[VAL_25]], %[[VAL_1]] : i64
+// CHECK: %[[VAL_29:.*]] = omp.map.bounds lower_bound(%[[VAL_1]] : i64) upper_bound(%[[VAL_28]] : i64) extent(%[[VAL_25]] : i64) stride(%[[VAL_27]] : i64) start_idx(%[[VAL_23]] : i64) {stride_in_bytes = true}
+// CHECK: %[[VAL_30:.*]] = llvm.call @firstprivatizer_1_init(%[[VAL_8]], %[[HEAP1]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_31:.*]] = llvm.call @firstprivatizer_1_copy(%[[VAL_8]], %[[VAL_30]]) : (!llvm.ptr, !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_32:.*]] = llvm.getelementptr %[[HEAP0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_33:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_32]] : !llvm.ptr) bounds(%[[VAL_19]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_34:.*]] = omp.map.info var_ptr(%[[HEAP0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_33]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: %[[VAL_35:.*]] = llvm.getelementptr %[[HEAP1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_36:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, i32) map_clauses({{.*}}to{{.*}}) capture(ByRef) var_ptr_ptr(%[[VAL_35]] : !llvm.ptr) bounds(%[[VAL_29]]) -> !llvm.ptr {name = ""}
+// CHECK: %[[VAL_37:.*]] = omp.map.info var_ptr(%[[HEAP1]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(always,{{.*}}to) capture(ByRef) members(%[[VAL_36]] : [0] : !llvm.ptr) -> !llvm.ptr
+// CHECK: omp.target depend(taskdependout -> %[[HEAP0]] : !llvm.ptr) nowait map_entries(%[[VAL_11]] -> %[[VAL_38:.*]], %[[VAL_34]] -> %[[VAL_39:.*]], %[[VAL_33]] -> %[[VAL_40:.*]], %[[VAL_37]] -> %[[VAL_41:.*]], %[[VAL_36]] -> %[[VAL_42:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) private(@firstprivatizer %[[HEAP0]] -> %[[VAL_43:.*]] [map_idx=1], @firstprivatizer_1 %[[HEAP1]] -> %[[VAL_44:.*]] [map_idx=3] : !llvm.ptr, !llvm.ptr) {
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.task depend(taskdependin -> %[[HEAP0]] : !llvm.ptr) {
+// CHECK: llvm.call @firstprivatizer_1_dealloc(%[[VAL_31]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.call @free(%[[HEAP1]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.call @firstprivatizer_dealloc(%[[VAL_21]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.call @free(%[[HEAP0]]) : (!llvm.ptr) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_45:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: %[[VAL_46:.*]] = llvm.getelementptr %[[VAL_5]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: %[[VAL_47:.*]] = llvm.load %[[VAL_46]] : !llvm.ptr -> !llvm.ptr
+// CHECK: llvm.call @free(%[[VAL_47]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_init(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_copy(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_dealloc(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} {
+// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_1_init(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i64) : i64
+// CHECK: %[[VAL_1:.*]] = llvm.call @malloc(%[[VAL_0]]) : (i64) -> !llvm.ptr
+// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[ARG1]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+// CHECK: llvm.store %[[VAL_1]], %[[VAL_2]] : !llvm.ptr, !llvm.ptr
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_1_copy(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr,
+// CHECK-SAME:      %[[ARG1:.*]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline} {
+// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(48 : i32) : i32
+// CHECK: "llvm.intr.memcpy"(%[[ARG1]], %[[ARG0]], %[[VAL_0]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+// CHECK: llvm.return %[[ARG1]] : !llvm.ptr
+// CHECK: }
+
+// CHECK-LABEL:   llvm.func @firstprivatizer_1_dealloc(
+// CHECK-SAME:      %[[ARG0:.*]]: !llvm.ptr) attributes {always_inline} {
+// CHECK: llvm.call @free(%[[ARG0]]) : (!llvm.ptr) -> ()
+// CHECK: llvm.return
+// CHECK: }
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index e8525a5..7574afa 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -9,6 +9,15 @@ func.func @argmax_nofold(%arg0: tensor<?x1xf32>) -> tensor<1xi32> {
 
 // -----
 
+// CHECK-LABEL: @test_argmax_fold_i64_index
+func.func @test_argmax_fold_i64_index(%arg0: tensor<1xi8>) -> tensor<i64> {
+  // CHECK: "tosa.const"() <{values = dense<0> : tensor<i64>}> : () -> tensor<i64>
+  %0 = tosa.argmax %arg0 {axis = 0 : i32} : (tensor<1xi8>) -> tensor<i64> // argmax over the single-element axis 0; expected to fold to a constant 0 of the i64 result type
+  return %0 : tensor<i64>
+}
+
+// -----
+
 // CHECK-LABEL: @pad_wh_avg_pool2d_fold
 func.func @pad_wh_avg_pool2d_fold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> {
   // CHECK-NOT: tosa.pad
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 0e1365a..27a3dc3 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -214,3 +214,54 @@ gpu.module @xevm_module{
 
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
+// CHECK:         %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
+// CHECK-SAME:      !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME:      memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:           gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK:         }
+// CHECK:         %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
+// CHECK-SAME:      (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:           %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME:        args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK:             gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK:           }
+// CHECK:           scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
+// CHECK:         }
+gpu.module @xevm_module{
+  gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
+    %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+    %arg2: memref<16x16xf32>) {
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : () -> (vector<16x1xf32>)
+    %ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : () -> (vector<16x16xf32>)
+    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) { // loop carries two values: %arg4 (16x16) and %arg5 (16x1)
+      %1  = "some_def"(%arg5)
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : (vector<16x1xf32>) -> (vector<16x1xf32>)
+      %acc = "some_def"(%arg4, %1)
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
+      scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
+    }
+    {
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    xegpu.store_nd %3#0, %arg1[%c0, %c0] // only loop result #0 is used; %3#1 is never read in this function
+      : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
index 9f4393e..127ab70 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
@@ -103,6 +103,17 @@ func.func @main() {
   // CHECK: unexpected negative result on dimension #0 of input/output operand #0
   func.call @reverse_from_3(%d5x) : (tensor<?xf32>) -> (tensor<?xf32>)
 
+  %c0x = arith.constant dense<1.0> : tensor<0xf32>
+  %d0x = tensor.cast %c0x : tensor<0xf32> to tensor<?xf32>
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @fill_empty_1d(%d0x) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+  %c0x5 = arith.constant dense<0.0> : tensor<0x5xf32>
+  %d0x5 = tensor.cast %c0x5 : tensor<0x5xf32> to tensor<?x?xf32>
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @fill_empty_2d(%d0x5) : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
   return
 }
 
@@ -297,3 +308,15 @@ func.func @reverse_from_3(%arg0: tensor<?xf32>) -> (tensor<?xf32>) {
   } -> tensor<?xf32>
   return %result : tensor<?xf32>
 }
+
+func.func @fill_empty_1d(%arg0: tensor<?xf32>) -> (tensor<?xf32>) {
+  %c0 = arith.constant 0.0 : f32
+  %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor<?xf32>) -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+func.func @fill_empty_2d(%arg0: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
+  %c0 = arith.constant 0.0 : f32
+  %0 = linalg.fill ins(%c0 : f32) outs(%arg0 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 2fa4470..af6d254 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -249,24 +249,6 @@ llvm.func @target_is_device_ptr(%x : !llvm.ptr) {
 
 // -----
 
-omp.private {type = firstprivate} @x.privatizer : i32 copy {
-^bb0(%mold: !llvm.ptr, %private: !llvm.ptr):
-  %0 = llvm.load %mold : !llvm.ptr -> i32
-  llvm.store %0, %private : i32, !llvm.ptr
-  omp.yield(%private: !llvm.ptr)
-}
-llvm.func @target_firstprivate(%x : !llvm.ptr) {
-  %0 = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr
-  // expected-error@below {{not yet implemented: Unhandled clause privatization for deferred target tasks in omp.target operation}}
-  // expected-error@below {{LLVM Translation failed for operation: omp.target}}
-  omp.target nowait map_entries(%0 -> %blockarg0 : !llvm.ptr) private(@x.privatizer %x -> %arg0 [map_idx=0] : !llvm.ptr) {
-    omp.terminator
-  }
-  llvm.return
-}
-
-// -----
-
 llvm.func @target_enter_data_depend(%x: !llvm.ptr) {
   // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}}
   // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}}
diff --git a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
index 1e59338..a54b642 100644
--- a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_library(MLIROpenACCTestPasses
   TestOpenACC.cpp
   TestPointerLikeTypeInterface.cpp
   TestRecipePopulate.cpp
+  TestOpenACCSupport.cpp
   
   EXCLUDE_FROM_LIBMLIR
 )
@@ -11,6 +12,7 @@ mlir_target_link_libraries(MLIROpenACCTestPasses PUBLIC
   MLIRFuncDialect
   MLIRMemRefDialect
   MLIROpenACCDialect
+  MLIROpenACCAnalysis
   MLIRPass
   MLIRSupport
 )
diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
index bea21b9..e59d777 100644
--- a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
+++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
@@ -16,11 +16,13 @@ namespace test {
 // Forward declarations of individual test pass registration functions
 void registerTestPointerLikeTypeInterfacePass();
 void registerTestRecipePopulatePass();
+void registerTestOpenACCSupportPass();
 
 // Unified registration function for all OpenACC tests
 void registerTestOpenACC() {
   registerTestPointerLikeTypeInterfacePass();
   registerTestRecipePopulatePass();
+  registerTestOpenACCSupportPass();
 }
 
 } // namespace test
diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp
new file mode 100644
index 0000000..8bf984b
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACCSupport.cpp
@@ -0,0 +1,73 @@
+//===- TestOpenACCSupport.cpp - Test OpenACCSupport Analysis -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains test passes for testing the OpenACCSupport analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
+/// Pass that prints what OpenACCSupport::getVariableName reports for every
+/// result of each operation tagged with the `test.var_name` attribute.
+struct TestOpenACCSupportPass
+    : public PassWrapper<TestOpenACCSupportPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOpenACCSupportPass)
+
+  StringRef getArgument() const override { return "test-acc-support"; }
+  StringRef getDescription() const override {
+    return "Test OpenACCSupport analysis";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<acc::OpenACCDialect, memref::MemRefDialect>();
+  }
+
+  void runOnOperation() override;
+};
+
+void TestOpenACCSupportPass::runOnOperation() {
+  // Fetch the analysis once; the walk callback below reuses it for every op.
+  OpenACCSupport &accSupport = getAnalysis<OpenACCSupport>();
+  func::FuncOp funcOp = getOperation();
+
+  // Ops are opted in to this test via the `test.var_name` marker attribute;
+  // anything without it is skipped.
+  funcOp.walk([&](Operation *candidate) {
+    if (!candidate->hasAttr("test.var_name"))
+      return;
+
+    // Emit one record per result: the op, then the name the analysis
+    // reports for that result.
+    for (auto opResult : candidate->getResults()) {
+      std::string resolvedName = accSupport.getVariableName(opResult);
+      llvm::outs() << "op=" << *candidate << "\n\tgetVariableName=\""
+                   << resolvedName << "\"\n";
+    }
+  });
+}
+
+} // namespace
+
+namespace mlir::test {
+
+/// Registers TestOpenACCSupportPass, whose command-line argument is
+/// `test-acc-support`.
+void registerTestOpenACCSupportPass() {
+  PassRegistration<TestOpenACCSupportPass>();
+}
+
+} // namespace mlir::test
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index ee4fa39..efbdbfb 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -2136,7 +2136,7 @@ struct TestTypeConversionDriver
                                           Location loc) -> Value {
       if (inputs.size() != 1 || !inputs[0].getType().isInteger(37))
         return Value();
-      return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
           .getResult(0);
     });
 
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
index ab817b6..3fbbcc9 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTest.cpp
@@ -410,3 +410,78 @@ TEST_F(OpenACCUtilsTest, getTypeCategoryArray) {
   VariableTypeCategory category = getTypeCategory(varPtr);
   EXPECT_EQ(category, VariableTypeCategory::array);
 }
+
+//===----------------------------------------------------------------------===//
+// getVariableName Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTest, getVariableNameDirect) {
+  // Create a memref with acc.var_name attribute
+  auto memrefTy = MemRefType::get({10}, b.getI32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+
+  // Set the acc.var_name attribute
+  auto varNameAttr = VarNameAttr::get(&context, "my_variable");
+  allocOp.get()->setAttr(getVarNameAttrName(), varNameAttr);
+
+  Value varPtr = allocOp->getResult();
+
+  // Test that getVariableName returns the variable name
+  std::string varName = getVariableName(varPtr);
+  EXPECT_EQ(varName, "my_variable");
+}
+
+TEST_F(OpenACCUtilsTest, getVariableNameThroughCast) {
+  // Create a 5x2 memref with acc.var_name attribute
+  auto memrefTy = MemRefType::get({5, 2}, b.getI32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+
+  // Set the acc.var_name attribute on the alloca
+  auto varNameAttr = VarNameAttr::get(&context, "casted_variable");
+  allocOp.get()->setAttr(getVarNameAttrName(), varNameAttr);
+
+  Value allocResult = allocOp->getResult();
+
+  // memref.cast must preserve rank, so cast 5x2 to the dynamic shape ?x?
+  auto castedMemrefTy = MemRefType::get(
+      {ShapedType::kDynamic, ShapedType::kDynamic}, b.getI32Type());
+  OwningOpRef<memref::CastOp> castOp =
+      memref::CastOp::create(b, loc, castedMemrefTy, allocResult);
+  Value castedPtr = castOp->getResult();
+
+  // Test that getVariableName walks through the cast to find the variable name
+  std::string varName = getVariableName(castedPtr);
+  EXPECT_EQ(varName, "casted_variable");
+}
+
+TEST_F(OpenACCUtilsTest, getVariableNameNotFound) {
+  // Create a memref without acc.var_name attribute
+  auto memrefTy = MemRefType::get({10}, b.getI32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+
+  Value varPtr = allocOp->getResult();
+
+  // Test that getVariableName returns empty string when no name is found
+  std::string varName = getVariableName(varPtr);
+  EXPECT_EQ(varName, "");
+}
+
+TEST_F(OpenACCUtilsTest, getVariableNameFromCopyin) {
+  // Create a memref
+  auto memrefTy = MemRefType::get({10}, b.getI32Type());
+  OwningOpRef<memref::AllocaOp> allocOp =
+      memref::AllocaOp::create(b, loc, memrefTy);
+
+  Value varPtr = allocOp->getResult();
+  StringRef name = "data_array";
+  OwningOpRef<CopyinOp> copyinOp =
+      CopyinOp::create(b, loc, varPtr, /*structured=*/true, /*implicit=*/true,
+                       /*name=*/name);
+
+  // Test that getVariableName extracts the name from the copyin operation
+  std::string varName = getVariableName(copyinOp->getAccVar());
+  EXPECT_EQ(varName, name);
+}
diff --git a/offload/include/device.h b/offload/include/device.h
index bf93ce0..4e27943 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
   /// Ask the device whether the runtime should use auto zero-copy.
   bool useAutoZeroCopy();
 
+  /// Ask the device whether the storage is accessible.
+  bool isAccessiblePtr(const void *Ptr, size_t Size);
+
   /// Check if there are pending images for this device.
   bool hasPendingImages() const { return HasPendingImages; }
 
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 794b79e..89aa468 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -278,6 +278,7 @@ int omp_get_initial_device(void);
 void *omp_target_alloc(size_t Size, int DeviceNum);
 void omp_target_free(void *DevicePtr, int DeviceNum);
 int omp_target_is_present(const void *Ptr, int DeviceNum);
+int omp_target_is_accessible(const void *Ptr, size_t Size, int DeviceNum);
 int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                       size_t DstOffset, size_t SrcOffset, int DstDevice,
                       int SrcDevice);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index b0f0573..48b086d 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -196,6 +196,34 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
   return Rc;
 }
 
+/// Check whether the \p Size bytes at \p Ptr are accessible from device
+/// \p DeviceNum. Returns true only when accessibility is guaranteed.
+EXTERN int omp_target_is_accessible(const void *Ptr, size_t Size,
+                                    int DeviceNum) {
+  TIMESCOPE();
+  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+  DP("Call to omp_target_is_accessible for device %d, address " DPxMOD
+     ", size %zu\n",
+     DeviceNum, DPxPTR(Ptr), Size);
+
+  if (!Ptr) {
+    DP("Call to omp_target_is_accessible with NULL ptr returning false\n");
+    return false;
+  }
+
+  if (DeviceNum == omp_get_initial_device() || DeviceNum == -1) {
+    DP("Call to omp_target_is_accessible on host, returning true\n");
+    return true;
+  }
+
+  // Any other device number must name a valid device; otherwise fatal error.
+  auto DeviceOrErr = PM->getDevice(DeviceNum);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+  return DeviceOrErr->isAccessiblePtr(Ptr, Size);
+}
+
 EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 71423ae..ee36fbe 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -367,3 +367,7 @@ bool DeviceTy::useAutoZeroCopy() {
     return false;
   return RTL->use_auto_zero_copy(RTLDeviceID);
 }
+
+bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
+  return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 1374bfe..910a5b6 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -43,6 +43,7 @@ VERS1.0 {
     omp_get_initial_device;
     omp_target_alloc;
     omp_target_free;
+    omp_target_is_accessible;
     omp_target_is_present;
     omp_target_memcpy;
     omp_target_memcpy_rect;
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 20d16fa..0b03ef5 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3062,6 +3062,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
   }
 
+  /// Reports whether \p Size bytes at \p Ptr are accessible from this device,
+  /// or an error if the HSA pointer query fails.
+  Expected<bool> isAccessiblePtrImpl(const void *Ptr, size_t Size) override {
+    hsa_amd_pointer_info_t Info;
+    Info.size = sizeof(hsa_amd_pointer_info_t);
+
+    // HSA fills Agents through the malloc callback; we must free() it.
+    hsa_agent_t *Agents = nullptr;
+    uint32_t Count = 0;
+    hsa_status_t Status =
+        hsa_amd_pointer_info(Ptr, &Info, malloc, &Count, &Agents);
+    // Scan the list (only meaningful on success) for this device's agent.
+    bool IsKnown = false;
+    for (uint32_t I = 0; Status == HSA_STATUS_SUCCESS && I < Count; ++I)
+      IsKnown |= Agents[I].handle == getAgent().handle;
+    free(Agents);
+    if (auto Err = Plugin::check(Status, "error in hsa_amd_pointer_info: %s"))
+      return std::move(Err);
+
+    // Known to HSA and visible to this agent: accessible if the allocation is
+    // large enough. Otherwise assume a host pointer, which needs XNACK.
+    return IsKnown ? Info.sizeInBytes >= Size : IsXnackEnabled;
+  }
+
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = StackSize;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c530bb..f9bff9a 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1066,6 +1066,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
+  /// Returns true if the plugin can guarantee that the associated
+  /// storage is accessible
+  Expected<bool> isAccessiblePtr(const void *Ptr, size_t Size);
+
   virtual Expected<omp_interop_val_t *>
   createInterop(int32_t InteropType, interop_spec_t &InteropSpec) {
     return nullptr;
@@ -1166,6 +1170,10 @@ private:
   /// Per device setting of MemoryManager's Threshold
   virtual size_t getMemoryManagerSizeThreshold() { return 0; }
 
+  virtual Expected<bool> isAccessiblePtrImpl(const void *Ptr, size_t Size) {
+    return false;
+  }
+
   /// Environment variables defined by the OpenMP standard.
   Int32Envar OMP_TeamLimit;
   Int32Envar OMP_NumTeams;
@@ -1492,6 +1500,9 @@ public:
   /// Returns if the plugin can support automatic copy.
   int32_t use_auto_zero_copy(int32_t DeviceId);
 
+  /// Returns if the associated storage is accessible for a given device.
+  int32_t is_accessible_ptr(int32_t DeviceId, const void *Ptr, size_t Size);
+
   /// Look up a global symbol in the given binary.
   int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
                      const char *Name, void **DevicePtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index db43cbe..36d643b 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1599,6 +1599,10 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
 
 bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
 
+Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
+  return isAccessiblePtrImpl(Ptr, Size);
+}
+
 Error GenericPluginTy::init() {
   if (Initialized)
     return Plugin::success();
@@ -2133,6 +2137,22 @@ int32_t GenericPluginTy::use_auto_zero_copy(int32_t DeviceId) {
   return getDevice(DeviceId).useAutoZeroCopy();
 }
 
+int32_t GenericPluginTy::is_accessible_ptr(int32_t DeviceId, const void *Ptr,
+                                           size_t Size) {
+  // Query the device; a plugin failure is logged and conservatively reported
+  // as "not accessible" because this entry point has no error channel.
+  auto AccessibleOrErr = getDevice(DeviceId).isAccessiblePtr(Ptr, Size);
+  if (Error Err = AccessibleOrErr.takeError()) {
+    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+    DP("Failure while checking accessibility of pointer %p for device %d: "
+       "%s\n",
+       Ptr, DeviceId, ErrStr.c_str());
+    return false;
+  }
+
+  return *AccessibleOrErr;
+}
+
 int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size,
                                     const char *Name, void **DevicePtr) {
   assert(Binary.handle && "Invalid device binary handle");
diff --git a/offload/test/mapping/is_accessible.cpp b/offload/test/mapping/is_accessible.cpp
new file mode 100644
index 0000000..7fb2389
--- /dev/null
+++ b/offload/test/mapping/is_accessible.cpp
@@ -0,0 +1,40 @@
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=1 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=0 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=NO_USM
+
+// REQUIRES: unified_shared_memory
+// XFAIL: nvptx
+
+// CHECK: SUCCESS
+// NO_USM: Not accessible
+
+#include <assert.h>
+#include <iostream>
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int n = 10000;
+  int *a = new int[n];
+  int err = 0;
+
+  // Host memory is only reported accessible when HSA_XNACK=1 (see RUN lines).
+  if (!omp_target_is_accessible(a, n * sizeof(int), /*device_num=*/0))
+    printf("Not accessible\n");
+  else {
+#pragma omp target teams distribute parallel for
+    for (int i = 0; i < n; i++)
+      a[i] = i;
+    for (int i = 0; i < n; i++)
+      if (a[i] != i)
+        err++;
+  }
+
+  delete[] a; // release the host buffer before exiting
+  printf("%s\n", err == 0 ? "SUCCESS" : "FAIL");
+  return err;
+}
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp b/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp
index 5e8769e..50a28e0 100644
--- a/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_class_member_ref_with_map.cpp
@@ -16,7 +16,7 @@ struct ST {
   int m = 0;
 
   void f6() {
-    uintptr_t offset = (uintptr_t)&d - n;
+    ptrdiff_t offset = (char *)&d - ((char *)(uintptr_t)n);
 #pragma omp target data map(to : m, d)
     {
       void *mapped_ptr = omp_get_mapped_ptr(&d, omp_get_default_device());
@@ -34,11 +34,15 @@ struct ST {
         // ref/attach modifiers:
         //  &ref_ptee(this[0].[d])), &ref_ptee(this[0].d), TO | FROM
         //  &ref_ptr(this[0].d), &ref_ptee(this[0].d), 4, ATTACH
-        // EXPECTED: 1 0
-        // CHECK:    0 1
-        printf("%d %d\n", &d == mapped_ptr,
-               (uintptr_t)&d == (uintptr_t)mapped_ptr - offset);
+        // EXPECTED:   1
+        // CHECK-NEXT: 0
+        printf("%d\n", &d == mapped_ptr);
+        ptrdiff_t offset_device = (char *)mapped_ptr - (char *)&d;
+        printf("offset = %td (%p), offset_device = %td (%p)\n", offset,
+               (void *)offset, offset_device, (void *)offset_device);
+        printf("mapped_ptr = %p, device_addr = %p, ", mapped_ptr, &d);
       }
+      printf("host_addr = %p\n", &d);
     }
   }
 };
diff --git a/offload/test/ompt/callbacks.h b/offload/test/ompt/callbacks.h
index 95437d9..2e7763f 100644
--- a/offload/test/ompt/callbacks.h
+++ b/offload/test/ompt/callbacks.h
@@ -5,6 +5,37 @@
 // Tool related code below
 #include <omp-tools.h>
 
+// Indexed by ompt_target_data_op_t. The *_async enumerators are 17..20, so
+// slots 7..16 are padding that keeps every name at its enumerator's index.
+static const char *ompt_target_data_op_t_values[] = {
+    "", "ompt_target_data_alloc", "ompt_target_data_transfer_to_device",
+    "ompt_target_data_transfer_from_device", "ompt_target_data_delete",
+    "ompt_target_data_associate", "ompt_target_data_disassociate",
+    // 7..16: no enumerator uses these values.
+    "", "", "", "", "", "", "", "", "", "",
+    "ompt_target_data_alloc_async",
+    "ompt_target_data_transfer_to_device_async",
+    "ompt_target_data_transfer_from_device_async",
+    "ompt_target_data_delete_async"};
+
+static const char *ompt_scope_endpoint_t_values[] = {
+    "", "ompt_scope_begin", "ompt_scope_end", "ompt_scope_beginend"};
+
+// Indexed by ompt_target_t: values 5..8 are unused, *_nowait start at 9.
+static const char *ompt_target_t_values[] = {"",
+                                             "ompt_target",
+                                             "ompt_target_enter_data",
+                                             "ompt_target_exit_data",
+                                             "ompt_target_update",
+                                             "",
+                                             "",
+                                             "",
+                                             "",
+                                             "ompt_target_nowait",
+                                             "ompt_target_enter_data_nowait",
+                                             "ompt_target_exit_data_nowait",
+                                             "ompt_target_update_nowait"};
+
 // For EMI callbacks
 ompt_id_t next_op_id = 0x8000000000000001;
 
@@ -38,11 +69,11 @@ static void on_ompt_callback_target_data_op(
     void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
     size_t bytes, const void *codeptr_ra) {
   assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  printf("  Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p "
+  printf("  Callback DataOp: target_id=%lu host_op_id=%lu optype=%s src=%p "
          "src_device_num=%d "
          "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
-         target_id, host_op_id, optype, src_addr, src_device_num, dest_addr,
-         dest_device_num, bytes, codeptr_ra);
+         target_id, host_op_id, ompt_target_data_op_t_values[optype], src_addr,
+         src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
 }
 
 static void on_ompt_callback_target(ompt_target_t kind,
@@ -51,9 +82,10 @@ static void on_ompt_callback_target(ompt_target_t kind,
                                     ompt_id_t target_id,
                                     const void *codeptr_ra) {
   assert(codeptr_ra != 0 && "Unexpected null codeptr");
-  printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d "
+  printf("Callback Target: target_id=%lu kind=%s endpoint=%s device_num=%d "
          "code=%p\n",
-         target_id, kind, endpoint, device_num, codeptr_ra);
+         target_id, ompt_target_t_values[kind],
+         ompt_scope_endpoint_t_values[endpoint], device_num, codeptr_ra);
 }
 
 static void on_ompt_callback_target_submit(ompt_id_t target_id,
@@ -84,13 +116,15 @@ static void on_ompt_callback_target_data_op_emi(
   // target_task_data may be null, avoid dereferencing it
   uint64_t target_task_data_value =
       (target_task_data) ? target_task_data->value : 0;
-  printf("  Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+  printf("  Callback DataOp EMI: endpoint=%s optype=%s target_task_data=%p "
          "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
          "src_device_num=%d "
          "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
-         endpoint, optype, target_task_data, target_task_data_value,
-         target_data, target_data->value, host_op_id, *host_op_id, src_addr,
-         src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+         ompt_scope_endpoint_t_values[endpoint],
+         ompt_target_data_op_t_values[optype], target_task_data,
+         target_task_data_value, target_data, target_data->value, host_op_id,
+         *host_op_id, src_addr, src_device_num, dest_addr, dest_device_num,
+         bytes, codeptr_ra);
 }
 
 static void on_ompt_callback_target_emi(ompt_target_t kind,
@@ -102,20 +136,21 @@ static void on_ompt_callback_target_emi(ompt_target_t kind,
   assert(codeptr_ra != 0 && "Unexpected null codeptr");
   if (endpoint == ompt_scope_begin)
     target_data->value = next_op_id++;
-  printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+  printf("Callback Target EMI: kind=%s endpoint=%s device_num=%d task_data=%p "
          "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
-         kind, endpoint, device_num, task_data, task_data->value,
-         target_task_data, target_task_data->value, target_data,
-         target_data->value, codeptr_ra);
+         ompt_target_t_values[kind], ompt_scope_endpoint_t_values[endpoint],
+         device_num, task_data, task_data ? task_data->value : 0,
+         target_task_data, target_task_data ? target_task_data->value : 0,
+         target_data, target_data->value, codeptr_ra);
 }
 
 static void on_ompt_callback_target_submit_emi(
     ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
     ompt_id_t *host_op_id, unsigned int requested_num_teams) {
-  printf("  Callback Submit EMI: endpoint=%d  req_num_teams=%d target_data=%p "
+  printf("  Callback Submit EMI: endpoint=%s  req_num_teams=%d target_data=%p "
          "(0x%lx) host_op_id=%p (0x%lx)\n",
-         endpoint, requested_num_teams, target_data, target_data->value,
-         host_op_id, *host_op_id);
+         ompt_scope_endpoint_t_values[endpoint], requested_num_teams,
+         target_data, target_data->value, host_op_id, *host_op_id);
 }
 
 static void on_ompt_callback_target_map_emi(ompt_data_t *target_data,
diff --git a/offload/test/ompt/omp_api.c b/offload/test/ompt/omp_api.c
index a16ef7a..5fb2098 100644
--- a/offload/test/ompt/omp_api.c
+++ b/offload/test/ompt/omp_api.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 #include "omp.h"
 #include <stdlib.h>
@@ -32,8 +34,8 @@ int main(int argc, char **argv) {
 
 // clang-format off
 /// CHECK: Callback Init:
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=5
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=6
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_associate
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_disassociate
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/target_memcpy.c b/offload/test/ompt/target_memcpy.c
index f244e0f4..f769995 100644
--- a/offload/test/ompt/target_memcpy.c
+++ b/offload/test/ompt/target_memcpy.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Verify that for the target OpenMP APIs, the return address is non-null and
@@ -46,26 +48,26 @@ int main() {
 }
 
 // clang-format off
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
 /// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
 /// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE1:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK-SAME: src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE1]]
 /// CHECK: code=[[CODE2:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE2]]
 /// CHECK: code=[[CODE3:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE3]]
 /// CHECK: code=[[CODE4:0x[0-f]+]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE4]]
diff --git a/offload/test/ompt/target_memcpy_emi.c b/offload/test/ompt/target_memcpy_emi.c
index 934caba..39f262a3 100644
--- a/offload/test/ompt/target_memcpy_emi.c
+++ b/offload/test/ompt/target_memcpy_emi.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Verify all three data transfer directions: H2D, D2D and D2H
@@ -54,28 +56,28 @@ int main(void) {
 /// CHECK: Callback Init:
 
 /// CHECK: Allocating Memory on Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
 /// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
 
 /// CHECK: Testing: Host to Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
 
 /// CHECK: Testing: Device to Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
 
 /// CHECK: Testing: Device to Host
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
 
 /// CHECK: Checking Correctness
 
 /// CHECK: Freeing Memory on Device
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 {{.+}} src_device_num=[[DEVICE]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 {{.+}} src_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete {{.+}} src_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete {{.+}} src_device_num=[[DEVICE]]
 
 /// CHECK: Callback Fini:
 
diff --git a/offload/test/ompt/veccopy.c b/offload/test/ompt/veccopy.c
index f28d94f..24d7363 100644
--- a/offload/test/ompt/veccopy.c
+++ b/offload/test/ompt/veccopy.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that registers non-EMI callbacks
@@ -54,48 +56,47 @@ int main() {
 // clang-format off
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1 device_num=[[DEVICE_NUM:[0-9]+]]
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin device_num=[[DEVICE_NUM:[0-9]+]]
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE1]]
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end device_num=[[DEVICE_NUM]] code=[[CODE1]]
 
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// device_num=[[DEVICE_NUM]]
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin device_num=[[DEVICE_NUM]]
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2 device_num=[[DEVICE_NUM]] code=[[CODE2]]
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end device_num=[[DEVICE_NUM]] code=[[CODE2]]
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_data.c b/offload/test/ompt/veccopy_data.c
index 059ca97..9df5374 100644
--- a/offload/test/ompt/veccopy_data.c
+++ b/offload/test/ompt/veccopy_data.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that registers EMI callbacks.
@@ -73,85 +75,86 @@ int main() {
   return rc;
 }
 
+// clang-format off
 /// CHECK-NOT: Callback Target EMI:
 /// CHECK-NOT: device_num=-1
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=2 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target_enter_data endpoint=ompt_scope_begin
 /// CHECK-NOT: device_num=-1
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=2 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target_enter_data endpoint=ompt_scope_end
 /// CHECK-NOT: device_num=-1
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
 /// CHECK-NOT: device_num=-1
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin  req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end  req_num_teams=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK-NOT: device_num=-1
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=3 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target_exit_data endpoint=ompt_scope_begin
 /// CHECK-NOT: device_num=-1
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE3:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE3]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE3]]
-/// CHECK: Callback Target EMI: kind=3 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target_exit_data endpoint=ompt_scope_end
 /// CHECK-NOT: device_num=-1
 /// CHECK: code=[[CODE3]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
 /// CHECK-NOT: device_num=-1
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE4:.*]]
-/// CHECK: Callback Submit EMI: endpoint=1  req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2  req_num_teams=1
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin  req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end  req_num_teams=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK-NOT: device_num=-1
 /// CHECK: code=[[CODE4]]
-/// CHECK: Callback Target EMI: kind=4 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target_update endpoint=ompt_scope_begin
 /// CHECK-NOT: device_num=-1
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE5:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE5]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE5]]
-/// CHECK: Callback Target EMI: kind=4 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target_update endpoint=ompt_scope_end
 /// CHECK-NOT: device_num=-1
 /// CHECK: code=[[CODE5]]
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_disallow_both.c b/offload/test/ompt/veccopy_disallow_both.c
index b531a62..bfc67c5 100644
--- a/offload/test/ompt/veccopy_disallow_both.c
+++ b/offload/test/ompt/veccopy_disallow_both.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that shows that both EMI and non-EMI
@@ -54,48 +56,49 @@ int main() {
   return rc;
 }
 
+// clang-format off
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_emi.c b/offload/test/ompt/veccopy_emi.c
index 2c57a85..a1427b8 100644
--- a/offload/test/ompt/veccopy_emi.c
+++ b/offload/test/ompt/veccopy_emi.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that registers EMI callbacks
@@ -52,89 +54,90 @@ int main() {
   return rc;
 }
 
+// clang-format off
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE1:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE1]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK: code=[[CODE1]]
 
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
 /// CHECK-NOT: code=(nil)
 /// CHECK: code=[[CODE2:.*]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=0
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=0
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
 /// CHECK: code=[[CODE2]]
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK: code=[[CODE2]]
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_emi_map.c b/offload/test/ompt/veccopy_emi_map.c
index fa18a43..450faa1 100644
--- a/offload/test/ompt/veccopy_emi_map.c
+++ b/offload/test/ompt/veccopy_emi_map.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that shows that map-EMI callbacks are not supported.
@@ -52,51 +54,52 @@ int main() {
   return rc;
 }
 
+// clang-format off
 /// CHECK: 0: Could not register callback 'ompt_callback_target_map_emi'
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=1
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
-/// CHECK: Callback Target EMI: kind=1 endpoint=1
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=1
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_alloc
 /// CHECK-NOT: dest=(nil)
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=2
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=2
-/// CHECK: Callback Submit EMI: endpoint=1 req_num_teams=0
-/// CHECK: Callback Submit EMI: endpoint=2 req_num_teams=0
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=3
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=1 optype=4
-/// CHECK: Callback DataOp EMI: endpoint=2 optype=4
-/// CHECK: Callback Target EMI: kind=1 endpoint=2
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_begin req_num_teams=0
+/// CHECK: Callback Submit EMI: endpoint=ompt_scope_end req_num_teams=0
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_begin optype=ompt_target_data_delete
+/// CHECK: Callback DataOp EMI: endpoint=ompt_scope_end optype=ompt_target_data_delete
+/// CHECK: Callback Target EMI: kind=ompt_target endpoint=ompt_scope_end
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_map.c b/offload/test/ompt/veccopy_map.c
index 2e817d3..12e141e 100644
--- a/offload/test/ompt/veccopy_map.c
+++ b/offload/test/ompt/veccopy_map.c
@@ -1,6 +1,8 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
 // REQUIRES: gpu
+// clang-format on
 
 /*
  * Example OpenMP program that shows that map callbacks are not supported.
@@ -51,31 +53,31 @@ int main() {
   return rc;
 }
 
-
+// clang-format off
 /// CHECK: 0: Could not register callback 'ompt_callback_target_map'
 /// CHECK: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
-
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
+
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
 /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/veccopy_no_device_init.c b/offload/test/ompt/veccopy_no_device_init.c
index 8ee8243..ade06fc 100644
--- a/offload/test/ompt/veccopy_no_device_init.c
+++ b/offload/test/ompt/veccopy_no_device_init.c
@@ -1,6 +1,7 @@
 // clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
+// clang-format on
 
 /*
  * Example OpenMP program that shows that if no device init callback
@@ -51,30 +52,31 @@ int main() {
 
   return rc;
 }
+
 // clang-format off
 /// CHECK-NOT: Callback Init:
 /// CHECK: Callback Load:
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
 
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
 /// CHECK-NOT: Callback Fini:
diff --git a/offload/test/ompt/veccopy_wrong_return.c b/offload/test/ompt/veccopy_wrong_return.c
index 2d07b4e..17327f3 100644
--- a/offload/test/ompt/veccopy_wrong_return.c
+++ b/offload/test/ompt/veccopy_wrong_return.c
@@ -1,5 +1,7 @@
+// clang-format off
 // RUN: %libomptarget-compile-run-and-check-generic
 // REQUIRES: ompt
+// clang-format on
 
 /*
  * Example OpenMP program that shows that if the initialize function
@@ -51,29 +53,30 @@ int main() {
   return rc;
 }
 
+// clang-format off
 /// CHECK-NOT: Callback Init:
 /// CHECK-NOT: Callback Load:
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
 
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_begin
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_alloc
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_to_device
 /// CHECK-NOT: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
-/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_transfer_from_device
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK-NOT: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=ompt_target_data_delete
+/// CHECK-NOT: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=ompt_target endpoint=ompt_scope_end
 /// CHECK-NOT: Callback Fini
diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst
index f7c9689..f5ea47b 100644
--- a/polly/docs/ReleaseNotes.rst
+++ b/polly/docs/ReleaseNotes.rst
@@ -11,3 +11,5 @@ In Polly |version| the following important changes have been incorporated.
     the new features that have recently been committed to our development
     branch.
 
+ * ScopInliner has been updated for the New Pass Manager.
+
diff --git a/polly/docs/experiments/matmul/matmul.ll b/polly/docs/experiments/matmul/matmul.ll
index b5bc4b0..6c789edc 100644
--- a/polly/docs/experiments/matmul/matmul.ll
+++ b/polly/docs/experiments/matmul/matmul.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @init_array() #0 {
+define dso_local void @init_array() {
 entry:
   %i = alloca i32, align 4
   %j = alloca i32, align 4
@@ -88,7 +88,7 @@ for.end19:                                        ; preds = %for.cond
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() #0 {
+define dso_local void @print_array() {
 entry:
   %i = alloca i32, align 4
   %j = alloca i32, align 4
@@ -154,10 +154,10 @@ for.end12:                                        ; preds = %for.cond
   ret void
 }
 
-declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...)
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local i32 @main() #0 {
+define dso_local i32 @main() {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -261,9 +261,6 @@ for.end30:                                        ; preds = %for.cond
   ret i32 0
 }
 
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
 
diff --git a/polly/docs/experiments/matmul/matmul.normalopt.ll b/polly/docs/experiments/matmul/matmul.normalopt.ll
index 1625dc7..a2c28b8 100644
--- a/polly/docs/experiments/matmul/matmul.normalopt.ll
+++ b/polly/docs/experiments/matmul/matmul.normalopt.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: noinline norecurse nounwind uwtable writeonly
-define dso_local void @init_array() local_unnamed_addr #0 {
+define dso_local void @init_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -60,7 +60,7 @@ for.end19:                                        ; preds = %for.inc17
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() local_unnamed_addr #1 {
+define dso_local void @print_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -75,7 +75,7 @@ for.body3:                                        ; preds = %for.inc, %for.cond1
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -103,10 +103,10 @@ for.end12:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind
-declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr
 
 ; Function Attrs: noinline norecurse nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #3 {
+define dso_local i32 @main() local_unnamed_addr {
 entry:
   tail call void @init_array()
   br label %for.cond1.preheader
@@ -164,13 +164,7 @@ for.end30:                                        ; preds = %for.inc28
 }
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4
-
-attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
index 3216d79..3be4393 100644
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
@@ -13,18 +13,18 @@ target triple = "x86_64-unknown-linux-gnu"
 @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @init_array() local_unnamed_addr #0 {
+define dso_local void @init_array() local_unnamed_addr {
 entry:
   %polly.par.userContext = alloca {}, align 8
   %polly.par.userContext1 = bitcast {}* %polly.par.userContext to i8*
-  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @init_array_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) #3
-  call void @init_array_polly_subfn(i8* nonnull %polly.par.userContext1) #3
-  call void @GOMP_parallel_end() #3
+  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @init_array_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1)
+  call void @init_array_polly_subfn(i8* nonnull %polly.par.userContext1)
+  call void @GOMP_parallel_end()
   ret void
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() local_unnamed_addr #1 {
+define dso_local void @print_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -39,7 +39,7 @@ for.body3:                                        ; preds = %for.inc, %for.cond1
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #3
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -67,27 +67,27 @@ for.end12:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind
-declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #0 {
+define dso_local i32 @main() local_unnamed_addr {
 entry:
   %polly.par.userContext3 = alloca {}, align 8
   tail call void @init_array()
   %polly.par.userContext1 = bitcast {}* %polly.par.userContext3 to i8*
-  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1) #3
-  call void @main_polly_subfn(i8* nonnull %polly.par.userContext1) #3
-  call void @GOMP_parallel_end() #3
-  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn_1, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 64) #3
-  call void @main_polly_subfn_1(i8* nonnull %polly.par.userContext1) #3
-  call void @GOMP_parallel_end() #3
+  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 1)
+  call void @main_polly_subfn(i8* nonnull %polly.par.userContext1)
+  call void @GOMP_parallel_end()
+  call void @GOMP_parallel_loop_runtime_start(void (i8*)* nonnull @main_polly_subfn_1, i8* nonnull %polly.par.userContext1, i32 0, i64 0, i64 1536, i64 64)
+  call void @main_polly_subfn_1(i8* nonnull %polly.par.userContext1)
+  call void @GOMP_parallel_end()
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #3
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr
 
-define internal void @init_array_polly_subfn(i8* nocapture readnone %polly.par.userContext) #4 {
+define internal void @init_array_polly_subfn(i8* nocapture readnone %polly.par.userContext) {
 polly.par.setup:
   %polly.par.LBPtr = alloca i64, align 8
   %polly.par.UBPtr = alloca i64, align 8
@@ -146,7 +146,7 @@ declare void @GOMP_parallel_loop_runtime_start(void (i8*)*, i8*, i32, i64, i64,
 
 declare void @GOMP_parallel_end() local_unnamed_addr
 
-define internal void @main_polly_subfn(i8* nocapture readnone %polly.par.userContext) #4 {
+define internal void @main_polly_subfn(i8* nocapture readnone %polly.par.userContext) {
 polly.par.setup:
   %polly.par.LBPtr = alloca i64, align 8
   %polly.par.UBPtr = alloca i64, align 8
@@ -175,7 +175,7 @@ polly.par.loadIVBounds:                           ; preds = %polly.par.setup, %p
   br i1 %7, label %polly.par.exit, label %polly.par.loadIVBounds
 }
 
-define internal void @main_polly_subfn_1(i8* nocapture readnone %polly.par.userContext) #4 {
+define internal void @main_polly_subfn_1(i8* nocapture readnone %polly.par.userContext) {
 polly.par.setup:
   %polly.par.LBPtr = alloca i64, align 8
   %polly.par.UBPtr = alloca i64, align 8
@@ -363,14 +363,7 @@ vector.ph:                                        ; preds = %polly.loop_header14
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-attributes #4 = { "polly.skip.fn" }
-attributes #5 = { argmemonly nounwind }
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
index 4afccd5..60228a45 100644
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: noinline norecurse nounwind uwtable writeonly
-define dso_local void @init_array() local_unnamed_addr #0 {
+define dso_local void @init_array() local_unnamed_addr {
 entry:
   br label %polly.loop_header
 
@@ -61,7 +61,7 @@ polly.loop_header1:                               ; preds = %polly.loop_header1,
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() local_unnamed_addr #1 {
+define dso_local void @print_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -76,7 +76,7 @@ for.body3:                                        ; preds = %for.inc, %for.cond1
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -104,10 +104,10 @@ for.end12:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind
-declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr
 
 ; Function Attrs: noinline norecurse nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #3 {
+define dso_local i32 @main() local_unnamed_addr {
 entry:
   tail call void @init_array()
   call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false)
@@ -282,17 +282,10 @@ vector.ph:                                        ; preds = %polly.loop_header26
 }
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5
-
-attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-attributes #5 = { argmemonly nounwind }
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
index 0641b55..d2786e0 100644
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: noinline norecurse nounwind uwtable writeonly
-define dso_local void @init_array() local_unnamed_addr #0 {
+define dso_local void @init_array() local_unnamed_addr {
 entry:
   br label %polly.loop_header
 
@@ -61,7 +61,7 @@ polly.loop_header1:                               ; preds = %polly.loop_header1,
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() local_unnamed_addr #1 {
+define dso_local void @print_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -76,7 +76,7 @@ for.body3:                                        ; preds = %for.inc, %for.cond1
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -104,10 +104,10 @@ for.end12:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind
-declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr
 
 ; Function Attrs: noinline norecurse nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #3 {
+define dso_local i32 @main() local_unnamed_addr {
 entry:
   tail call void @init_array()
   call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false)
@@ -343,17 +343,10 @@ vector.ph:                                        ; preds = %polly.loop_header26
 }
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5
-
-attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-attributes #5 = { argmemonly nounwind }
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
index bfc74f2..5854cad 100644
--- a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @C = common dso_local local_unnamed_addr global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: noinline norecurse nounwind uwtable writeonly
-define dso_local void @init_array() local_unnamed_addr #0 {
+define dso_local void @init_array() local_unnamed_addr {
 entry:
   br label %polly.loop_header
 
@@ -61,7 +61,7 @@ polly.loop_header1:                               ; preds = %polly.loop_header1,
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() local_unnamed_addr #1 {
+define dso_local void @print_array() local_unnamed_addr {
 entry:
   br label %for.cond1.preheader
 
@@ -76,7 +76,7 @@ for.body3:                                        ; preds = %for.inc, %for.cond1
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #4
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -104,10 +104,10 @@ for.end12:                                        ; preds = %for.end
 }
 
 ; Function Attrs: nounwind
-declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #2
+declare dso_local i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr
 
 ; Function Attrs: noinline norecurse nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #3 {
+define dso_local i32 @main() local_unnamed_addr {
 entry:
   tail call void @init_array()
   call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([1536 x [1536 x float]]* @C to i8*), i8 0, i64 9437184, i1 false)
@@ -189,17 +189,10 @@ polly.loop_exit22:                                ; preds = %vector.body
 }
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr #4
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) local_unnamed_addr
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5
-
-attributes #0 = { noinline norecurse nounwind uwtable writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-attributes #5 = { argmemonly nounwind }
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/docs/experiments/matmul/matmul.preopt.ll b/polly/docs/experiments/matmul/matmul.preopt.ll
index 3b9446f..e55be8ba 100644
--- a/polly/docs/experiments/matmul/matmul.preopt.ll
+++ b/polly/docs/experiments/matmul/matmul.preopt.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @init_array() #0 {
+define dso_local void @init_array() {
 entry:
   br label %entry.split
 
@@ -52,7 +52,7 @@ for.end19:                                        ; preds = %for.inc17
 }
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local void @print_array() #0 {
+define dso_local void @print_array() {
 entry:
   br label %entry.split
 
@@ -70,7 +70,7 @@ for.body3:                                        ; preds = %for.cond1.preheader
   %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
   %2 = load float, float* %arrayidx5, align 4
   %conv = fpext float %2 to double
-  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv)
   %3 = trunc i64 %indvars.iv to i32
   %rem = urem i32 %3, 80
   %cmp6 = icmp eq i32 %rem, 79
@@ -98,10 +98,10 @@ for.end12:                                        ; preds = %for.end
   ret void
 }
 
-declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+declare dso_local i32 @fprintf(%struct._IO_FILE*, i8*, ...)
 
 ; Function Attrs: noinline nounwind uwtable
-define dso_local i32 @main() #0 {
+define dso_local i32 @main() {
 entry:
   br label %entry.split
 
@@ -148,14 +148,10 @@ for.end30:                                        ; preds = %for.inc28
 }
 
 ; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture)
 
 ; Function Attrs: nounwind
-declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h
index c3b68a7..9978344c 100644
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@@ -119,7 +119,7 @@ struct PollyForcePassLinking {
 
 namespace llvm {
 void initializeCodePreparationPass(llvm::PassRegistry &);
-void initializeScopInlinerPass(llvm::PassRegistry &);
+void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &);
 void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &);
 void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeScopInfoRegionPassPass(PassRegistry &);
diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h
new file mode 100644
index 0000000..0146678
--- /dev/null
+++ b/polly/include/polly/ScopInliner.h
@@ -0,0 +1,34 @@
+//===------ ScopInliner.h ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_POLLYINLINER_H
+#define POLLY_POLLYINLINER_H
+
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/IR/PassManager.h"
+
+namespace polly {
+class ScopInlinerPass : public llvm::PassInfoMixin<ScopInlinerPass> {
+public:
+  ScopInlinerPass();
+
+  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
+                              llvm::CGSCCAnalysisManager &AM,
+                              llvm::LazyCallGraph &CG,
+                              llvm::CGSCCUpdateResult &UR);
+};
+
+llvm::Pass *createScopInlinerWrapperPass();
+} // namespace polly
+
+namespace llvm {
+void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &);
+}
+
+#endif /* POLLY_POLLYINLINER_H */
diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def
index e068f31..2c792a5 100644
--- a/polly/lib/Support/PollyPasses.def
+++ b/polly/lib/Support/PollyPasses.def
@@ -1,3 +1,9 @@
+#ifndef CGSCC_PASS
+#define CGSCC_PASS(NAME, CREATE_PASS, PARSER)
+#endif
+CGSCC_PASS("polly-inline", ScopInlinerPass(), parseNoOptions)
+#undef CGSCC_PASS
+
 #ifndef FUNCTION_ANALYSIS
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)
 #endif
diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 0420dff..04f8715 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -35,6 +35,7 @@
 #include "polly/ScopDetection.h"
 #include "polly/ScopGraphPrinter.h"
 #include "polly/ScopInfo.h"
+#include "polly/ScopInliner.h"
 #include "polly/Simplify.h"
 #include "polly/Support/DumpFunctionPass.h"
 #include "polly/Support/DumpModulePass.h"
@@ -46,10 +47,13 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/IPO.h"
 
+using namespace llvm;
 namespace cl = llvm::cl;
+using namespace polly;
 
 using llvm::FunctionPassManager;
 using llvm::OptimizationLevel;
@@ -233,7 +237,7 @@ void initializePollyPasses(llvm::PassRegistry &Registry) {
   initializePollyCanonicalizePass(Registry);
   initializeScopDetectionWrapperPassPass(Registry);
   initializeScopDetectionPrinterLegacyPassPass(Registry);
-  initializeScopInlinerPass(Registry);
+  initializeScopInlinerWrapperPassPass(Registry);
   initializeScopInfoRegionPassPass(Registry);
   initializeScopInfoPrinterLegacyRegionPassPass(Registry);
   initializeScopInfoWrapperPassPass(Registry);
@@ -434,6 +438,16 @@ static void buildLatePollyPipeline(FunctionPassManager &PM,
         false);
 }
 
+static llvm::Expected<std::monostate> parseNoOptions(StringRef Params) {
+  if (!Params.empty())
+    return make_error<StringError>(
+        formatv("'{0}' passed to pass that does not take any options", Params)
+            .str(),
+        inconvertibleErrorCode());
+
+  return std::monostate{};
+}
+
 static OwningScopAnalysisManagerFunctionProxy
 createScopAnalyses(FunctionAnalysisManager &FAM,
                    PassInstrumentationCallbacks *PIC) {
@@ -461,6 +475,23 @@ static void registerFunctionAnalyses(FunctionAnalysisManager &FAM,
   FAM.registerPass([&FAM, PIC] { return createScopAnalyses(FAM, PIC); });
 }
 
+static llvm::Expected<bool>
+parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM,
+                PassInstrumentationCallbacks *PIC,
+                ArrayRef<PassBuilder::PipelineElement> Pipeline) {
+#define CGSCC_PASS(NAME, CREATE_PASS, PARSER)                                  \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME)) {                    \
+    auto Params = PassBuilder::parsePassParameters(PARSER, Name, NAME);        \
+    if (!Params)                                                               \
+      return Params.takeError();                                               \
+    CGPM.addPass(CREATE_PASS);                                                 \
+    return true;                                                               \
+  }
+#include "PollyPasses.def"
+
+  return false;
+}
+
 static bool
 parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM,
                       ArrayRef<PassBuilder::PipelineElement> Pipeline) {
@@ -598,6 +629,12 @@ void registerPollyPasses(PassBuilder &PB) {
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
         return parseScopPipeline(Name, FPM, PIC, Pipeline);
       });
+  PB.registerPipelineParsingCallback(
+      [PIC](StringRef Name, CGSCCPassManager &CGPM,
+            ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
+        ExitOnError Err("Unable to parse Polly call graph pass: ");
+        return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline));
+      });
   PB.registerParseTopLevelPipelineCallback(
       [PIC](llvm::ModulePassManager &MPM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp
index b78206c..c04ba34 100644
--- a/polly/lib/Transform/ScopInliner.cpp
+++ b/polly/lib/Transform/ScopInliner.cpp
@@ -13,10 +13,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "polly/LinkAllPasses.h"
+#include "polly/ScopInliner.h"
 #include "polly/ScopDetection.h"
+#include "polly/ScopInliner.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
@@ -28,13 +32,77 @@ using namespace llvm;
 using namespace polly;
 
 namespace {
-class ScopInliner final : public CallGraphSCCPass {
+
+/// Inliner implementation that works with both the LPM (using
+/// SCC_t=CallGraphSCC) and the NPM (using SCC_t=LazyCallGraph::SCC).
+template <typename SCC_t> bool runScopInlinerImpl(Function *F, SCC_t &SCC) {
+  // We do not try to inline non-trivial SCCs because this would lead to
+  // "infinite" inlining if we are not careful.
+  if (SCC.size() > 1)
+    return false;
+  assert(SCC.size() == 1 && "found empty SCC");
+
+  // Nothing to do if the function is a nullptr or is only a declaration.
+  if (!F)
+    return false;
+  if (F->isDeclaration()) {
+    POLLY_DEBUG(dbgs() << "Skipping " << F->getName()
+                       << "because it is a declaration.\n");
+    return false;
+  }
+
+  PassBuilder PB;
+  // Populate the analysis managers with the analyses PassBuilder registers.
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
+  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(*F);
+  auto &LI = FAM.getResult<LoopAnalysis>(*F);
+  auto &RI = FAM.getResult<RegionInfoAnalysis>(*F);
+  auto &AA = FAM.getResult<AAManager>(*F);
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  ScopDetection SD(DT, SE, LI, RI, AA, ORE);
+  SD.detect(*F);
+
+  const bool HasScopAsTopLevelRegion =
+      SD.ValidRegions.contains(RI.getTopLevelRegion());
+
+  bool Changed = false;
+  if (HasScopAsTopLevelRegion) {
+    POLLY_DEBUG(dbgs() << "Skipping " << F->getName()
+                       << " has scop as top level region");
+    F->addFnAttr(llvm::Attribute::AlwaysInline);
+
+    ModulePassManager MPM;
+    MPM.addPass(AlwaysInlinerPass());
+    Module *M = F->getParent();
+    assert(M && "Function has illegal module");
+    PreservedAnalyses PA = MPM.run(*M, MAM);
+    if (!PA.areAllPreserved())
+      Changed = true;
+  } else {
+    POLLY_DEBUG(dbgs() << F->getName()
+                       << " does NOT have scop as top level region\n");
+  }
+
+  return Changed;
+}
+
+class ScopInlinerWrapperPass final : public CallGraphSCCPass {
   using llvm::Pass::doInitialization;
 
 public:
   static char ID;
 
-  ScopInliner() : CallGraphSCCPass(ID) {}
+  ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {}
 
   bool doInitialization(CallGraph &CG) override {
     if (!polly::PollyAllowFullFunction) {
@@ -50,60 +118,8 @@ public:
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override {
-    // We do not try to inline non-trivial SCCs because this would lead to
-    // "infinite" inlining if we are not careful.
-    if (SCC.size() > 1)
-      return false;
-    assert(SCC.size() == 1 && "found empty SCC");
     Function *F = (*SCC.begin())->getFunction();
-
-    // If the function is a nullptr, or the function is a declaration.
-    if (!F)
-      return false;
-    if (F->isDeclaration()) {
-      POLLY_DEBUG(dbgs() << "Skipping " << F->getName()
-                         << "because it is a declaration.\n");
-      return false;
-    }
-
-    PassBuilder PB;
-    // Populate analysis managers and register Polly-specific analyses.
-    LoopAnalysisManager LAM;
-    FunctionAnalysisManager FAM;
-    CGSCCAnalysisManager CGAM;
-    ModuleAnalysisManager MAM;
-    FAM.registerPass([] { return ScopAnalysis(); });
-    PB.registerModuleAnalyses(MAM);
-    PB.registerCGSCCAnalyses(CGAM);
-    PB.registerFunctionAnalyses(FAM);
-    PB.registerLoopAnalyses(LAM);
-    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
-    RegionInfo &RI = FAM.getResult<RegionInfoAnalysis>(*F);
-    ScopDetection &SD = FAM.getResult<ScopAnalysis>(*F);
-
-    const bool HasScopAsTopLevelRegion =
-        SD.ValidRegions.contains(RI.getTopLevelRegion());
-
-    bool Changed = false;
-    if (HasScopAsTopLevelRegion) {
-      POLLY_DEBUG(dbgs() << "Skipping " << F->getName()
-                         << " has scop as top level region");
-      F->addFnAttr(llvm::Attribute::AlwaysInline);
-
-      ModulePassManager MPM;
-      MPM.addPass(AlwaysInlinerPass());
-      Module *M = F->getParent();
-      assert(M && "Function has illegal module");
-      PreservedAnalyses PA = MPM.run(*M, MAM);
-      if (!PA.areAllPreserved())
-        Changed = true;
-    } else {
-      POLLY_DEBUG(dbgs() << F->getName()
-                         << " does NOT have scop as top level region\n");
-    }
-
-    return Changed;
+    return runScopInlinerImpl(F, SCC);
   };
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -111,18 +127,39 @@ public:
   }
 };
 } // namespace
-char ScopInliner::ID;
+char ScopInlinerWrapperPass::ID;
 
-Pass *polly::createScopInlinerPass() {
-  ScopInliner *pass = new ScopInliner();
+Pass *polly::createScopInlinerWrapperPass() {
+  ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass();
   return pass;
 }
 
 INITIALIZE_PASS_BEGIN(
-    ScopInliner, "polly-scop-inliner",
+    ScopInlinerWrapperPass, "polly-scop-inliner",
     "inline functions based on how much of the function is a scop.", false,
     false)
 INITIALIZE_PASS_END(
-    ScopInliner, "polly-scop-inliner",
+    ScopInlinerWrapperPass, "polly-scop-inliner",
     "inline functions based on how much of the function is a scop.", false,
     false)
+
+polly::ScopInlinerPass::ScopInlinerPass() {
+  if (!polly::PollyAllowFullFunction) {
+    report_fatal_error(
+        "Aborting from ScopInliner because it only makes sense to run with "
+        "-polly-allow-full-function. "
+        "The heurtistic for ScopInliner checks that the full function is a "
+        "Scop, which happens if and only if polly-allow-full-function is "
+        " enabled. "
+        " If not, the entry block is not included in the Scop");
+  }
+}
+
+PreservedAnalyses polly::ScopInlinerPass::run(llvm::LazyCallGraph::SCC &SCC,
+                                              llvm::CGSCCAnalysisManager &AM,
+                                              llvm::LazyCallGraph &CG,
+                                              llvm::CGSCCUpdateResult &UR) {
+  // Hand the SCC's first function to the implementation shared by both passes.
+  const bool Modified = runScopInlinerImpl(&SCC.begin()->getFunction(), SCC);
+  return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
index ac78b4e..4deab1a 100644
--- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
+++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
@@ -5,7 +5,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @main() local_unnamed_addr #0 {
+define void @main() local_unnamed_addr {
 entry:
   %0 = load ptr, ptr undef, align 8, !tbaa !1
   %1 = load ptr, ptr undef, align 8, !tbaa !1
@@ -35,8 +35,6 @@ kernel_gemver_StrictFP.exit:                      ; preds = %for.inc85.i238
   ret void
 }
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 6.0.0 "}
@@ -47,7 +45,6 @@ attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="fals
 !5 = !{!6, !6, i64 0}
 !6 = !{!"double", !3, i64 0}
 
-
 ; CHECK-LABEL: define internal void @main_polly_subfn(ptr %polly.par.userContext)
 ;
 ; CHECK:       polly.stmt.for.body65.i226:
diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
index 96dc425..c207f58 100644
--- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
+++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
@@ -23,7 +23,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @longLimit = external global [9 x [23 x i32]], align 16
 @shortLimit = external global [9 x [14 x i32]], align 16
 
-define void @init_layer3(i32 %down_sample_sblimit) #0 {
+define void @init_layer3(i32 %down_sample_sblimit) {
 entry:
   br label %for.cond.463.preheader
 
@@ -63,8 +63,6 @@ for.inc.530:                                      ; preds = %for.inc.527
   br i1 %exitcond142, label %for.cond.499.preheader, label %for.cond.533.preheader
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 246359)"}
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
index 6ffe6bf..dfef420 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
@@ -35,7 +35,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @global4 = external global ptr, align 8
 
 ; Function Attrs: uwtable
-define i32 @foo(ptr %arg) #0 personality ptr @blam {
+define i32 @foo(ptr %arg) personality ptr @blam {
 bb:
   br label %bb3
 
@@ -83,40 +83,33 @@ bb19:                                             ; preds = %bb19, %bb14
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind readnone
-declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #2
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
 
 ; Function Attrs: nobuiltin
-declare noalias ptr @eggs(i64) #3
+declare noalias ptr @eggs(i64)
 
 ; Function Attrs: nobuiltin
-declare noalias ptr @bar(i64) #3
+declare noalias ptr @bar(i64)
 
 ; Function Attrs: uwtable
-declare void @zot(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr #0 align 2
+declare void @zot(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr align 2
 
 declare i32 @blam(...)
 
 ; Function Attrs: nobuiltin nounwind
-declare void @zot5(ptr) #4
+declare void @zot5(ptr)
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
+declare void @llvm.lifetime.end(i64, ptr nocapture)
 
 ; Function Attrs: uwtable
-declare i32 @eggs6(ptr) #0
+declare i32 @eggs6(ptr)
 
 ; Function Attrs: nounwind uwtable
-declare void @eggs7(ptr, i32, i32, i32) unnamed_addr #5 align 2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nobuiltin "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nobuiltin nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @eggs7(ptr, i32, i32, i32) unnamed_addr align 2
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
index 68c247a..fcc6764 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @REGISTER = external global [10 x i32], align 16
 
 ; Function Attrs: nounwind uwtable
-define void @FORMAT3_4() #0 {
+define void @FORMAT3_4() {
 entry:
   %INSTR = alloca [32 x i32], align 16
   br label %entry.split
@@ -20,7 +20,7 @@ entry.split:                                      ; preds = %entry
   br i1 %cmp, label %if.end.36, label %if.else
 
 if.else:                                          ; preds = %entry.split
-  call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) #2
+  call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef)
   %1 = load i32, ptr undef, align 4
   %cmp14 = icmp eq i32 %1, 1
   br i1 %cmp14, label %land.lhs.true, label %if.end.36
@@ -179,8 +179,4 @@ return:                                           ; preds = %if.then.219, %if.th
   ret void
 }
 
-declare void @BYTES_TO_BITS(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @BYTES_TO_BITS(...)
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index 65fa678..ed4b81a 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -9,7 +9,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @foo(ptr %A, i64 %N) #0 !dbg !4 {
+define void @foo(ptr %A, i64 %N) !dbg !4 {
 entry:
   br label %entry.split
 
@@ -49,13 +49,10 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; CHECK-NOT: #dbg_value
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !12}
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index 008e16c..abec288 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -17,10 +17,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.timeb.11.32.53.242.347.557.599.851.998.1208.2069.2153.2174.2237.2258.2279.2321 = type { i64, i16, i16, i16 }
 %struct.pix_pos.13.34.55.244.349.559.601.853.1000.1210.2071.2155.2176.2239.2260.2281.2323 = type { i32, i32, i32, i32, i32, i32 }
 
-declare void @getLuma4x4Neighbour() #0
+declare void @getLuma4x4Neighbour()
 
 ; Function Attrs: nounwind uwtable
-define void @readCBP_CABAC(ptr %img) #1 {
+define void @readCBP_CABAC(ptr %img) {
 entry:
   %block_a = alloca %struct.pix_pos.13.34.55.244.349.559.601.853.1000.1210.2071.2155.2176.2239.2260.2281.2323, align 4
   %mb_data = getelementptr inbounds %struct.img_par.12.33.54.243.348.558.600.852.999.1209.2070.2154.2175.2238.2259.2280.2322, ptr %img, i64 0, i32 39
@@ -45,7 +45,7 @@ if.end.35:                                        ; preds = %if.else.19, %if.els
   br i1 %cmp36, label %if.then.38, label %if.else.66
 
 if.then.38:                                       ; preds = %if.end.35
-  call void @getLuma4x4Neighbour() #2
+  call void @getLuma4x4Neighbour()
   %0 = load i32, ptr null, align 4
   %tobool = icmp eq i32 %0, 0
   br i1 %tobool, label %if.end.72, label %if.then.42
@@ -72,7 +72,3 @@ for.inc.84:                                       ; preds = %if.end.72
 for.end.86:                                       ; preds = %for.inc.84
   ret void
 }
-
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index 1f065be..aa29bfd 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -26,7 +26,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.bar.11 = type { ptr, ptr, ptr }
 
 ; Function Attrs: nounwind uwtable
-define void @foo(ptr %arg) #0 {
+define void @foo(ptr %arg) {
 bb:
   br label %bb2
 
@@ -49,8 +49,6 @@ bb10:                                             ; preds = %bb9
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"}
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index e76ee06..1b913f2 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -27,7 +27,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.foo.12 = type { %struct.foo.4, i32, i32, i32, i32 }
 
 ; Function Attrs: nounwind uwtable
-define void @eggs(ptr %arg) #0 {
+define void @eggs(ptr %arg) {
 bb:
   %tmp = load ptr, ptr undef, align 8, !tbaa !1
   br label %bb5
@@ -62,8 +62,6 @@ bb22:                                             ; preds = %bb13.bb22_crit_edge
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"}
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index 6dca218..a708548 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @A = common global [1024 x i32] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @jd() #0 {
+define void @jd() {
 entry:
   %tmp = alloca [1024 x i32], align 16
   br label %for.cond
@@ -32,7 +32,7 @@ for.cond:                                         ; preds = %for.inc11, %entry
   br i1 %exitcond5, label %for.body, label %for.end13
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.lifetime.start(i64 4096, ptr %tmp) #1
+  call void @llvm.lifetime.start(i64 4096, ptr %tmp)
   br label %for.cond2
 
 for.cond2:                                        ; preds = %for.inc, %for.body
@@ -59,7 +59,7 @@ for.end:                                          ; preds = %for.cond2
   %tmp8 = load i32, ptr %arrayidx8, align 4
   %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3
   store i32 %tmp8, ptr %arrayidx10, align 4
-  call void @llvm.lifetime.end(i64 4096, ptr %tmp) #1
+  call void @llvm.lifetime.end(i64 4096, ptr %tmp)
   br label %for.inc11
 
 for.inc11:                                        ; preds = %for.end
@@ -71,10 +71,7 @@ for.end13:                                        ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
+declare void @llvm.lifetime.end(i64, ptr nocapture)
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index 8416489..a643b8a 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -28,7 +28,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @A = common global [1024 x i32] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @jd() #0 {
+define void @jd() {
 entry:
   %tmp = alloca [1024 x i32], align 16
   br label %for.cond
@@ -39,7 +39,7 @@ for.cond:                                         ; preds = %for.inc11, %entry
   br i1 %exitcond5, label %for.body, label %for.end13
 
 for.body:                                         ; preds = %for.cond
-  %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) #1
+  %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A)
   br label %for.cond2
 
 for.cond2:                                        ; preds = %for.inc, %for.body
@@ -68,7 +68,7 @@ for.end:                                          ; preds = %for.cond2
   %arrayidx8 = getelementptr inbounds [1024 x i32], ptr %tmp, i64 0, i64 %indvars.iv3
   %tmp8 = load i32, ptr %arrayidx8, align 4
   %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3
-  call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) #1
+  call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A)
   store i32 %tmp8, ptr %arrayidx10, align 4
   br label %for.inc11
 
@@ -81,19 +81,16 @@ for.end13:                                        ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.donothing() #1
+declare void @llvm.donothing()
 
 ; Function Attrs: nounwind
-declare void @llvm.assume(i1) #1
+declare void @llvm.assume(i1)
 
 ; Function Attrs: nounwind
-declare i1 @llvm.expect.i1(i1, i1) #1
+declare i1 @llvm.expect.i1(i1, i1)
 
 ; Function Attrs: nounwind
-declare ptr @llvm.invariant.start(i64, ptr nocapture) #1
+declare ptr @llvm.invariant.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.invariant.end(ptr, i64, ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
+declare void @llvm.invariant.end(ptr, i64, ptr nocapture)
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index 0859a4e..420cb60 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -24,7 +24,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @sudecrypt(ptr %buff) #0 {
+define void @sudecrypt(ptr %buff) {
 entry:
   br i1 undef, label %cleanup, label %if.end
 
@@ -62,8 +62,6 @@ cleanup:                                          ; preds = %entry
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 2995bce..0a88bb7 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -8,11 +8,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK-NEXT:   %polly.access.tmp2.load = load i32, ptr %polly.access.tmp2, align 1
 ; CHECK-NEXT:   store i32 %polly.access.tmp2.load, ptr %tmp.preload.s2a
 
-
 %struct.hoge = type { [4 x i8], i32, i32, i32, i32, i32, [16 x i8], [16 x i8], i64, i64, i64, i64, i64 }
 
 ; Function Attrs: nounwind uwtable
-define void @widget() #0 {
+define void @widget() {
 bb:
   %tmp2 = alloca %struct.hoge, align 1
   br label %bb3
@@ -40,8 +39,6 @@ bb13:                                             ; preds = %bb10
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259771)"}
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 01b0176..5a11adc 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -35,7 +35,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @donestkptr = external global ptr, align 8
 
 ; Function Attrs: uwtable
-define i32 @_Z13dotableswitchP9Classfile(ptr %c) #0 personality ptr @__gxx_personality_v0 {
+define i32 @_Z13dotableswitchP9Classfile(ptr %c) personality ptr @__gxx_personality_v0 {
 entry:
   br label %entry.split
 
@@ -82,33 +82,27 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 }
 
 ; Function Attrs: nounwind readnone
-declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
 
 ; Function Attrs: nobuiltin
-declare noalias ptr @_Znam(i64) #2
+declare noalias ptr @_Znam(i64)
 
 ; Function Attrs: nobuiltin
-declare noalias ptr @_Znwm(i64) #2
+declare noalias ptr @_Znwm(i64)
 
 ; Function Attrs: uwtable
-declare void @_ZN3ExpC2Ejj7Exptype4Type2OpPS_jjP4Case(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr #0 align 2
+declare void @_ZN3ExpC2Ejj7Exptype4Type2OpPS_jjP4Case(ptr, i32, i32, i32, i32, i32, ptr, i32, i32, ptr) unnamed_addr align 2
 
 declare i32 @__gxx_personality_v0(...)
 
 ; Function Attrs: nobuiltin nounwind
-declare void @_ZdlPv(ptr) #3
+declare void @_ZdlPv(ptr)
 
 ; Function Attrs: uwtable
-declare i32 @_Z10doluswitchP9Classfile(ptr) #0
+declare i32 @_Z10doluswitchP9Classfile(ptr)
 
 ; Function Attrs: nounwind uwtable
-declare void @_ZN4Exp_C2E7Exptype4Type2Op(ptr, i32, i32, i32) unnamed_addr #4 align 2
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nobuiltin "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nobuiltin nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_ZN4Exp_C2E7Exptype4Type2Op(ptr, i32, i32, i32) unnamed_addr align 2
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index c9affac..1dcc175 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -15,7 +15,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @fileblobSetFilename() #0 {
+define void @fileblobSetFilename() {
 entry:
   br i1 undef, label %if.end, label %cleanup
 
@@ -23,7 +23,7 @@ if.end:                                           ; preds = %entry
   br i1 undef, label %land.lhs.true, label %if.end.18
 
 land.lhs.true:                                    ; preds = %if.end
-  %call9 = tail call ptr @__errno_location() #2
+  %call9 = tail call ptr @__errno_location()
   %tmp = load i32, ptr %call9, align 4, !tbaa !1
   br i1 false, label %if.then.12, label %if.end.18
 
@@ -45,11 +45,7 @@ cleanup:                                          ; preds = %if.end.27, %entry
 }
 
 ; Function Attrs: nounwind readnone
-declare ptr @__errno_location() #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare ptr @__errno_location()
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 7ef5608..43b3d99 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -39,7 +39,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @enc_picture = external global ptr, align 8
 
 ; Function Attrs: nounwind uwtable
-define void @compute_colocated(ptr %listX, ptr %A, ptr %B) #0 {
+define void @compute_colocated(ptr %listX, ptr %A, ptr %B) {
 entry:
   br label %for.body2414
 
@@ -82,8 +82,6 @@ if.end2624:                                       ; preds = %for.inc2621
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0"}
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index d906585..f43247b 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Just make sure this test passes correctly.
 
-define void @kernel_ludcmp(ptr %b, ptr %y) #0 {
+define void @kernel_ludcmp(ptr %b, ptr %y) {
 entry:
   br label %entry.split
 
@@ -115,8 +115,6 @@ for.end.131:                                      ; preds = %for.end.118
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index f6aca37..101fcaf 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -5,7 +5,7 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-define void @hoge(ptr %arg) #0 {
+define void @hoge(ptr %arg) {
 bb:
   br label %bb3
 
@@ -22,8 +22,6 @@ bb7:                                              ; preds = %bb4, %bb3
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0 (trunk 259751) (llvm/trunk 259869)"}
diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
index a4f9423..dd0a24b 100644
--- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll
+++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT: %_s.sroa.343.0.ph5161118.ph.final_reload = load i32, ptr %_s.sroa.343.0.ph5161118.s2a
 
 ; Function Attrs: nounwind uwtable
-define void @lzmaDecode() #0 {
+define void @lzmaDecode() {
 entry:
   br label %for.cond.outer.outer.outer
 
@@ -54,8 +54,6 @@ cleanup.1072:                                     ; preds = %for.cond
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll
index 2361206..447a14e 100644
--- a/polly/test/CodeGen/phi-defined-before-scop.ll
+++ b/polly/test/CodeGen/phi-defined-before-scop.ll
@@ -14,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @global = external global ptr, align 8
 
 ; Function Attrs: nounwind uwtable
-define void @wobble() #0 {
+define void @wobble() {
 bb:
   br label %bb1
 
@@ -41,8 +41,6 @@ bb9:                                              ; preds = %bb8
   unreachable
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll
index 4a4add8..7547b0b 100644
--- a/polly/test/CodeGen/pr25241.ll
+++ b/polly/test/CodeGen/pr25241.ll
@@ -20,12 +20,11 @@
 ; CHECK:         %curr.3.ph.final_reload = load i32, ptr %curr.3.s2a
 ; CHECK:         br label
 
-
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @BZ2_decompress() #0 {
+define void @BZ2_decompress() {
 entry:
   %tmp = load i32, ptr undef, align 4, !tbaa !1
   switch i32 undef, label %save_state_and_return [
@@ -56,8 +55,6 @@ save_state_and_return:                            ; preds = %entry
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
index 6c6c257..0adb0ba 100644
--- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
+++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
@@ -31,7 +31,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @reduce_sidechannel = external global i32, align 4
 
 ; Function Attrs: nounwind uwtable
-define void @iteration_init(ptr %gfp, ptr %l3_side, ptr %l3_enc) #0 {
+define void @iteration_init(ptr %gfp, ptr %l3_side, ptr %l3_enc) {
 entry:
   %resvDrain = getelementptr inbounds %struct.III_side_info_t.7.62.139.227.293, ptr %l3_side, i64 0, i32 2
   store i32 0, ptr %resvDrain, align 8
@@ -86,5 +86,3 @@ for.inc.117:                                      ; preds = %for.inc.114, %for.c
   %cmp95 = icmp slt i64 %indvars.iv.next158, %6
   br i1 %cmp95, label %for.cond.98.preheader, label %for.cond.120.preheader
 }
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll
index f93814c1..5abbf48 100644
--- a/polly/test/DependenceInfo/fine_grain_dep_0.ll
+++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll
@@ -31,7 +31,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @test(ptr %a, ptr %b, i64 %N) #0 {
+define void @test(ptr %a, ptr %b, i64 %N) {
 entry:
   br label %for.cond
 
@@ -67,8 +67,6 @@ for.end:                                          ; preds = %for.cond
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0"}
diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll
index 496e8315..6c81fb1 100644
--- a/polly/test/ForwardOpTree/atax.ll
+++ b/polly/test/ForwardOpTree/atax.ll
@@ -2,7 +2,7 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-define internal fastcc void @kernel_atax(ptr nocapture readonly %A, ptr nocapture readonly %x, ptr nocapture %y, ptr nocapture %tmp) unnamed_addr #0 {
+define internal fastcc void @kernel_atax(ptr nocapture readonly %A, ptr nocapture readonly %x, ptr nocapture %y, ptr nocapture %tmp) unnamed_addr {
 entry:
   br label %entry.split
 
@@ -61,10 +61,7 @@ for.end42:                                        ; preds = %for.inc40
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i32, i1) #1
-
-attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i32, i1)
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
@@ -78,7 +75,6 @@ attributes #1 = { argmemonly nounwind }
 !6 = !{!7, !7, i64 0}
 !7 = !{!"double", !4, i64 0}
 
-
 ; CHECK: Statistics {
 ; CHECK:     Operand trees forwarded: 2
 ; CHECK:     Statements with forwarded operand trees: 2
diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll
index c9c71a1..cb035bb 100644
--- a/polly/test/ForwardOpTree/jacobi-1d.ll
+++ b/polly/test/ForwardOpTree/jacobi-1d.ll
@@ -2,7 +2,7 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-define internal fastcc void @kernel_jacobi_1d(ptr noalias nocapture %A, ptr noalias nocapture %B) unnamed_addr #0 {
+define internal fastcc void @kernel_jacobi_1d(ptr noalias nocapture %A, ptr noalias nocapture %B) unnamed_addr {
 entry:
   br label %entry.split
 
@@ -46,8 +46,6 @@ for.end35:                                        ; preds = %for.inc33
   ret void
 }
 
-attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
 
@@ -60,7 +58,6 @@ attributes #0 = { noinline norecurse nounwind uwtable "correctly-rounded-divide-
 !6 = !{!7, !7, i64 0}
 !7 = !{!"double", !4, i64 0}
 
-
 ; CHECK: Statistics {
 ; CHECK:     Operand trees forwarded: 2
 ; CHECK:     Statements with forwarded operand trees: 1
diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
index 8c3f230..aef509a 100644
--- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
+++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 $_ZNKSt5ctypeIcE5widenEc = comdat any
 
 ; Function Attrs: uwtable
-define weak_odr signext i8 @_ZNKSt5ctypeIcE5widenEc(ptr %this, i8 signext %__c) #0 comdat align 2 {
+define weak_odr signext i8 @_ZNKSt5ctypeIcE5widenEc(ptr %this, i8 signext %__c) comdat align 2 {
 entry:
   br label %entry.split
 
@@ -45,10 +45,7 @@ return:                                           ; preds = %if.end, %if.then
   ret i8 %retval.0
 }
 
-declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(ptr) #1
-
-attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(ptr)
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 6e9ade8..a19b93d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -47,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: norecurse nounwind uwtable
-define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, ptr nocapture %tmp, ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readnone %C, ptr nocapture readnone %D) local_unnamed_addr #0 {
+define void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, ptr nocapture %tmp, ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readnone %C, ptr nocapture readnone %D) local_unnamed_addr {
 entry:
   br label %entry.split
 
@@ -93,8 +93,6 @@ for.end27:                                        ; preds = %for.inc25
   ret void
 }
 
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-avx512vpopcntdq,-clflushopt,-clwb,-clzero,-fma4,-lwp,-mwaitx,-pku,-prefetchwt1,-prfchw,-rdseed,-rtm,-sgx,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index a18ba1d..1c6d289 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @B = common global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @foo() #0 {
+define void @foo() {
 entry:
   br label %entry.split
 
@@ -51,8 +51,6 @@ for.end30:                                        ; preds = %for.inc28
   ret void
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 383; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 4db61ad..1ff20d1 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @B = common global [1536 x [1536 x float]] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @foo() #0 {
+define void @foo() {
 entry:
   br label %entry.split
 
@@ -53,8 +53,6 @@ for.end30:                                        ; preds = %for.inc28
   ret void
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 47; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 47; c1 += 1)
@@ -95,7 +93,6 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"=
 ; VEC16:                 Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
 ; VEC16: }
 
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.5.0 "}
diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll
index d799d57..20d02b1 100644
--- a/polly/test/ScopDetect/error-block-always-executed.ll
+++ b/polly/test/ScopDetect/error-block-always-executed.ll
@@ -8,14 +8,14 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct.hoge = type { i32, i32, i32, i32 }
 
 ; Function Attrs: nounwind uwtable
-define void @widget() #0 {
+define void @widget() {
 bb13:
   %tmp1 = alloca %struct.hoge, align 4
   br i1 undef, label %bb14, label %bb19
 
 bb14:                                             ; preds = %bb13
   %tmp = load i32, ptr undef, align 4, !tbaa !1
-  call void @quux() #2
+  call void @quux()
   br i1 false, label %bb15, label %bb18
 
 bb15:                                             ; preds = %bb14
@@ -46,11 +46,7 @@ bb25:                                             ; preds = %bb2
   unreachable
 }
 
-declare void @quux() #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @quux()
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
index ba271f3..6c66f6d 100644
--- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll
+++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @hoge() #0 {
+define void @hoge() {
 bb:
   br label %bb1
 
@@ -15,7 +15,7 @@ bb1:                                              ; preds = %bb
 
 bb2:                                              ; preds = %bb1
   %tmp = load i32, ptr undef, align 8, !tbaa !1
-  %tmp3 = tail call i32 @widget() #2
+  %tmp3 = tail call i32 @widget()
   br i1 false, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb2
@@ -33,11 +33,7 @@ bb8:                                              ; preds = %bb7, %bb5, %bb4
 }
 
 ; Function Attrs: inlinehint nounwind readonly uwtable
-declare i32 @widget() #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readonly }
+declare i32 @widget()
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll
index df35d056..a5c9626 100644
--- a/polly/test/ScopDetect/expand-region-correctly-2.ll
+++ b/polly/test/ScopDetect/expand-region-correctly-2.ll
@@ -5,7 +5,7 @@
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @qtm_decompress() #0 {
+define void @qtm_decompress() {
 entry:
   br label %if.end.1631
 
@@ -45,8 +45,6 @@ cleanup.1785:                                     ; preds = %if.then.1659
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 250010) (llvm/trunk 250018)"}
diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll
index 0f9c700..58c9197 100644
--- a/polly/test/ScopDetect/intrinsics_1.ll
+++ b/polly/test/ScopDetect/intrinsics_1.ll
@@ -15,7 +15,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @jd(ptr noalias %A, ptr noalias %B) #0 {
+define void @jd(ptr noalias %A, ptr noalias %B) {
 entry:
   br label %for.cond
 
@@ -28,29 +28,29 @@ for.body:                                         ; preds = %for.cond
   %tmp = trunc i64 %indvars.iv to i32
   %conv = sitofp i32 %tmp to double
   %tmp1 = call double @llvm.sqrt.f64(double %conv)
-  %call = call double @__log10_finite(double %tmp1) #2
-  %call1 = call double @ceil(double %call) #2
+  %call = call double @__log10_finite(double %tmp1)
+  %call1 = call double @ceil(double %call)
   %tmp2 = trunc i64 %indvars.iv to i32
   %conv2 = sitofp i32 %tmp2 to double
-  %call3 = call double @__log2_finite(double %conv2) #2
-  %call4 = call double @floor(double %call3) #2
+  %call3 = call double @__log2_finite(double %conv2)
+  %call4 = call double @floor(double %call3)
   %tmp3 = call double @llvm.pow.f64(double %call1, double %call4)
   %conv5 = fptosi double %tmp3 to i32
   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
   store i32 %conv5, ptr %arrayidx, align 4
   %tmp4 = trunc i64 %indvars.iv to i32
   %conv6 = sitofp i32 %tmp4 to double
-  %call7 = call double @sin(double %conv6) #2
-  %call8 = call double @__log_finite(double %call7) #2
+  %call7 = call double @sin(double %conv6)
+  %call8 = call double @__log_finite(double %call7)
   %tmp5 = trunc i64 %indvars.iv to i32
   %conv9 = sitofp i32 %tmp5 to double
-  %call10 = call double @cos(double %conv9) #2
-  %call11 = call double @__exp2_finite(double %call10) #2
+  %call10 = call double @cos(double %conv9)
+  %call11 = call double @__exp2_finite(double %call10)
   %add = fadd fast double %call8, %call11
-  %call12 = call double @fabs(double %add) #2
+  %call12 = call double @fabs(double %add)
   %tmp6 = trunc i64 %indvars.iv to i32
   %conv13 = sitofp i32 %tmp6 to double
-  %call14 = call double @__exp_finite(double %conv13) #2
+  %call14 = call double @__exp_finite(double %conv13)
   %add15 = fadd fast double %call12, %call14
   %conv16 = fptrunc double %add15 to float
   %arrayidx18 = getelementptr inbounds float, ptr %B, i64 %indvars.iv
@@ -66,41 +66,37 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind readnone
-declare double @ceil(double) #1
+declare double @ceil(double)
 
 ; Function Attrs: nounwind readnone
-declare double @__log10_finite(double) #1
+declare double @__log10_finite(double)
 
 ; Function Attrs: nounwind readnone
-declare double @llvm.sqrt.f64(double) #2
+declare double @llvm.sqrt.f64(double)
 
 ; Function Attrs: nounwind readnone
-declare double @floor(double) #1
+declare double @floor(double)
 
 ; Function Attrs: nounwind readnone
-declare double @__log2_finite(double) #1
+declare double @__log2_finite(double)
 
 ; Function Attrs: nounwind readnone
-declare double @llvm.pow.f64(double, double) #2
+declare double @llvm.pow.f64(double, double)
 
 ; Function Attrs: nounwind readnone
-declare double @fabs(double) #1
+declare double @fabs(double)
 
 ; Function Attrs: nounwind readnone
-declare double @__log_finite(double) #1
+declare double @__log_finite(double)
 
 ; Function Attrs: nounwind readnone
-declare double @sin(double) #1
+declare double @sin(double)
 
 ; Function Attrs: nounwind readnone
-declare double @__exp2_finite(double) #1
+declare double @__exp2_finite(double)
 
 ; Function Attrs: nounwind readnone
-declare double @cos(double) #1
+declare double @cos(double)
 
 ; Function Attrs: nounwind readnone
-declare double @__exp_finite(double) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare double @__exp_finite(double)
diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll
index 1db9807..f71016e 100644
--- a/polly/test/ScopDetect/intrinsics_2.ll
+++ b/polly/test/ScopDetect/intrinsics_2.ll
@@ -20,7 +20,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @A = common global [1024 x i32] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @jd() #0 {
+define void @jd() {
 entry:
   %tmp = alloca [1024 x i32], align 16
   br label %for.cond
@@ -31,7 +31,7 @@ for.cond:                                         ; preds = %for.inc11, %entry
   br i1 %exitcond5, label %for.body, label %for.end13
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.lifetime.start(i64 4096, ptr %tmp) #1
+  call void @llvm.lifetime.start(i64 4096, ptr %tmp)
   br label %for.cond2
 
 for.cond2:                                        ; preds = %for.inc, %for.body
@@ -62,7 +62,7 @@ for.end:                                          ; preds = %for.cond2
 
 for.inc11:                                        ; preds = %for.end
   %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
-  call void @llvm.lifetime.end(i64 4096, ptr %tmp) #1
+  call void @llvm.lifetime.end(i64 4096, ptr %tmp)
   br label %for.cond
 
 for.end13:                                        ; preds = %for.cond
@@ -70,10 +70,7 @@ for.end13:                                        ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
+declare void @llvm.lifetime.end(i64, ptr nocapture)
diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll
index a230d0a..579d5bd 100644
--- a/polly/test/ScopDetect/intrinsics_3.ll
+++ b/polly/test/ScopDetect/intrinsics_3.ll
@@ -20,7 +20,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @A = common global [1024 x i32] zeroinitializer, align 16
 
 ; Function Attrs: nounwind uwtable
-define void @jd() #0 {
+define void @jd() {
 entry:
   %tmp = alloca [1024 x i32], align 16
   br label %for.cond
@@ -31,7 +31,7 @@ for.cond:                                         ; preds = %for.inc11, %entry
   br i1 %exitcond5, label %for.body, label %for.end13
 
 for.body:                                         ; preds = %for.cond
-  %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A) #1
+  %lis = call ptr @llvm.invariant.start(i64 4096, ptr @A)
   br label %for.cond2
 
 for.cond2:                                        ; preds = %for.inc, %for.body
@@ -60,7 +60,7 @@ for.end:                                          ; preds = %for.cond2
   %arrayidx8 = getelementptr inbounds [1024 x i32], ptr %tmp, i64 0, i64 %indvars.iv3
   %tmp8 = load i32, ptr %arrayidx8, align 4
   %arrayidx10 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %indvars.iv3
-  call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A) #1
+  call void @llvm.invariant.end(ptr %lis, i64 4096, ptr @A)
   store i32 %tmp8, ptr %arrayidx10, align 4
   br label %for.inc11
 
@@ -73,19 +73,16 @@ for.end13:                                        ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.donothing() #1
+declare void @llvm.donothing()
 
 ; Function Attrs: nounwind
-declare void @llvm.assume(i1) #1
+declare void @llvm.assume(i1)
 
 ; Function Attrs: nounwind
-declare i1 @llvm.expect.i1(i1, i1) #1
+declare i1 @llvm.expect.i1(i1, i1)
 
 ; Function Attrs: nounwind
-declare ptr @llvm.invariant.start(i64, ptr nocapture) #1
+declare ptr @llvm.invariant.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.invariant.end(ptr, i64, ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
+declare void @llvm.invariant.end(ptr, i64, ptr nocapture)
diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll
index a99a2ef..5e4c38d 100644
--- a/polly/test/ScopDetect/report-scop-location.ll
+++ b/polly/test/ScopDetect/report-scop-location.ll
@@ -2,7 +2,7 @@
 target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @foo(ptr %A) #0 !dbg !4 {
+define void @foo(ptr %A) !dbg !4 {
 entry:
   br label %entry.split
 
@@ -28,7 +28,7 @@ for.end:                                          ; preds = %for.body
 ; CHECK: test.c:3: End of scop
 
 ; Function Attrs: nounwind uwtable
-define void @bar(ptr %A) #0 !dbg !7 {
+define void @bar(ptr %A) !dbg !7 {
 entry:
   br label %entry.split
 
@@ -53,8 +53,6 @@ for.end:                                          ; preds = %for.body
 ; CHECK: test.c:9: Start of scop
 ; CHECK: test.c:13: End of scop
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
@@ -77,4 +75,3 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"=
 !16 = distinct !DILexicalBlock(line: 9, column: 0, file: !1, scope: !7)
 !17 = !DILocation(line: 13, scope: !16)
 !18 = !DILocation(line: 14, scope: !7)
-
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
index d22c3b6..30e5fb9 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
@@ -19,9 +19,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK-NEXT: remark: ReportIrreducibleRegion.c:9:4: Irreducible region encountered in control flow.
 ; CHECK-NEXT: remark: ReportIrreducibleRegion.c:9:4: Invalid Scop candidate ends here.
 
-
 ; Function Attrs: nounwind uwtable
-define void @foo(i32 %a, i32 %b) #0 !dbg !4 {
+define void @foo(i32 %a, i32 %b) !dbg !4 {
 entry:
   %a.addr = alloca i32, align 4
   %b.addr = alloca i32, align 4
@@ -66,10 +65,7 @@ if.end6:                                          ; preds = %if.end5, %entry
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index cb91300..a96b64e 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -79,9 +79,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
index 3743bfa..6156efa 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
@@ -19,12 +19,10 @@
 
 ; CHECK: remark: ReportLoopHasNoExit.c:7:7: Loop cannot be handled because it has no exit.
 
-
-
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @func(i32 %param0, i32 %N, ptr %A) #0 !dbg !6 {
+define void @func(i32 %param0, i32 %N, ptr %A) !dbg !6 {
 entry:
   %param0.addr = alloca i32, align 4
   %N.addr = alloca i32, align 4
@@ -80,10 +78,7 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
index 832045f..13ac9d5 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
@@ -5,7 +5,6 @@
 ;     A[i*i] = 0;
 ; }
 
-
 ; CHECK: remark: ReportNonAffineAccess-01.c:2:7: The following errors keep this region from being a Scop.
 ; CHECK: remark: ReportNonAffineAccess-01.c:3:5: The array subscript of "A" is not affine
 ; CHECK: remark: ReportNonAffineAccess-01.c:3:5: Invalid Scop candidate ends here.
@@ -40,9 +39,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
index b951487..93e9e8b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
@@ -74,9 +74,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; YAML: Args:
 ; YAML:   - String:          Invalid Scop candidate ends here.
 
-
 ; Function Attrs: nounwind uwtable
-define void @onlyWrite(ptr %A) #0 !dbg !4 {
+define void @onlyWrite(ptr %A) !dbg !4 {
 entry:
   call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !14, metadata !15), !dbg !16
   call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !17, metadata !15), !dbg !20
@@ -102,10 +101,10 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 ; Function Attrs: nounwind uwtable
-define void @onlyRead(ptr %A) #0 !dbg !10 {
+define void @onlyRead(ptr %A) !dbg !10 {
 entry:
   call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !29, metadata !15), !dbg !30
   call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !31, metadata !15), !dbg !33
@@ -131,10 +130,7 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !12}
diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
index c2efd61..5f296fa 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -21,7 +21,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.b = type { ptr }
 
-define void @a(ptr nocapture readonly %A) #0 !dbg !4 {
+define void @a(ptr nocapture readonly %A) !dbg !4 {
 entry:
   br label %entry.split
 
@@ -57,10 +57,7 @@ for.end:                                          ; preds = %for.body
   ret void, !dbg !34
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20, !21}
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll
index 10a0a58..0b69beaa 100644
--- a/polly/test/ScopInfo/BoundChecks/single-loop.ll
+++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll
@@ -38,7 +38,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @foo(i64 %n, ptr %A) #0 {
+define void @foo(i64 %n, ptr %A) {
 entry:
   br label %for.cond
 
@@ -51,7 +51,7 @@ for.body:                                         ; preds = %for.cond
   br i1 false, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  call void (...) @exception() #2
+  call void (...) @exception()
   unreachable
 
 if.end:                                           ; preds = %for.body
@@ -59,7 +59,7 @@ if.end:                                           ; preds = %for.body
   br i1 %cmp2, label %if.then.3, label %if.end.4
 
 if.then.3:                                        ; preds = %if.end
-  call void (...) @exception() #2
+  call void (...) @exception()
   unreachable
 
 if.end.4:                                         ; preds = %if.end
@@ -79,11 +79,7 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: noreturn
-declare void @exception(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noreturn nounwind }
+declare void @exception(...)
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll
index c85ac5b..f2ba17d 100644
--- a/polly/test/ScopInfo/BoundChecks/two-loops.ll
+++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll
@@ -35,7 +35,7 @@
 ; AST:     {  /* original code */ }
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-define void @foo(i64 %n, ptr %A) #0 {
+define void @foo(i64 %n, ptr %A) {
 entry:
   br label %for.cond
 
@@ -56,7 +56,7 @@ for.body.3:                                       ; preds = %for.cond.1
   br i1 false, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body.3
-  call void (...) @exception() #2
+  call void (...) @exception()
   unreachable
 
 if.end:                                           ; preds = %for.body.3
@@ -64,7 +64,7 @@ if.end:                                           ; preds = %for.body.3
   br i1 %cmp5, label %if.then.6, label %if.end.7
 
 if.then.6:                                        ; preds = %if.end
-  call void (...) @exception() #2
+  call void (...) @exception()
   unreachable
 
 if.end.7:                                         ; preds = %if.end
@@ -90,11 +90,7 @@ for.end.10:                                       ; preds = %for.cond
   ret void
 }
 
-declare void @exception(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { noreturn nounwind }
+declare void @exception(...)
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll
index 6a6dde6..4a2a1d2 100644
--- a/polly/test/ScopInfo/complex-expression.ll
+++ b/polly/test/ScopInfo/complex-expression.ll
@@ -13,7 +13,7 @@
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; Function Attrs: norecurse nounwind
-define i32 @foo(ptr nocapture readonly %src1, ptr nocapture readonly %src2, ptr nocapture %score, ptr nocapture %max, i32 %n) #0 {
+define i32 @foo(ptr nocapture readonly %src1, ptr nocapture readonly %src2, ptr nocapture %score, ptr nocapture %max, i32 %n) {
 entry:
   %cmp33 = icmp sgt i32 %n, 0
   br i1 %cmp33, label %for.body.preheader, label %for.body7.preheader
@@ -129,8 +129,6 @@ cleanup:                                          ; preds = %for.body7.preheader
   ret i32 %retval.0
 }
 
-attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+strict-align" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
 
diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
index baa423f..a3ca595 100644
--- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
+++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
@@ -6,7 +6,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @FORMAT3_4() #0 {
+define void @FORMAT3_4() {
 entry:
   br label %entry.split
 
@@ -14,7 +14,7 @@ entry.split:                                      ; preds = %entry
   br i1 false, label %if.end.38, label %if.else
 
 if.else:                                          ; preds = %entry.split
-  call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef) #2
+  call void (i32, i32, ptr, ...) @BYTES_TO_BITS(i32 undef, i32 1, ptr undef)
   %0 = load i32, ptr null, align 4
   br label %if.end.38
 
@@ -22,11 +22,7 @@ if.end.38:                                        ; preds = %if.else, %entry.spl
   unreachable
 }
 
-declare void @BYTES_TO_BITS(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+hle,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-prfchw,-rdseed,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @BYTES_TO_BITS(...)
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
index eed19b3..9a1edcb 100644
--- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll
+++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.regnode_charclass_class.2.42.654.690.726.870.978.1770.1806.1842.2166.2274.2382.2598.2814.3030.3064 = type { i8, i8, i16, i32, [32 x i8], [4 x i8] }
 
 ; Function Attrs: nounwind uwtable
-define void @S_cl_or(ptr %cl, ptr %or_with) #0 {
+define void @S_cl_or(ptr %cl, ptr %or_with) {
 entry:
   %0 = load i8, ptr %or_with, align 4, !tbaa !1
   %conv = zext i8 %0 to i32
@@ -37,8 +37,6 @@ if.end91:                                         ; preds = %for.body71, %for.bo
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0"}
diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll
index 1a8858d..95212f8 100644
--- a/polly/test/ScopInfo/expensive-boundary-context.ll
+++ b/polly/test/ScopInfo/expensive-boundary-context.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.foo.1 = type { [3 x i32], [3 x i32], i32, i32, [4 x i32], [4 x i32] }
 
 ; Function Attrs: nounwind uwtable
-define void @hoge() #0 {
+define void @hoge() {
 bb:
   %tmp52 = alloca ptr, align 8
   %tmp53 = alloca ptr, align 8
@@ -250,8 +250,6 @@ bb245:                                            ; preds = %bb244, %bb232
   unreachable
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
 
 !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"}
diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll
index c5bbacb..e6d9e73 100644
--- a/polly/test/ScopInfo/intrinsics.ll
+++ b/polly/test/ScopInfo/intrinsics.ll
@@ -14,7 +14,7 @@
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-define void @fun() #0 {
+define void @fun() {
 entry:
   %A = alloca [1024 x i32], align 16
   br label %for.cond
@@ -40,7 +40,4 @@ for.end:                                          ; preds = %for.cond
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.donothing() #1
-
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
+declare void @llvm.donothing()
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index 6027975..e32748a 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -13,13 +13,13 @@ target triple = "x86_64-unknown-linux-gnu"
 @global2 = external unnamed_addr constant [79 x i8], align 1
 @global3 = external unnamed_addr constant [57 x i8], align 1
 
-declare void @widget() #0
+declare void @widget()
 
 ; Function Attrs: nounwind
-declare void @quux(ptr, i64, ptr, ...) #1
+declare void @quux(ptr, i64, ptr, ...)
 
 ; Function Attrs: nounwind uwtable
-define void @hoge(ptr %A) #2 {
+define void @hoge(ptr %A) {
 bb:
   br label %bb15
 
@@ -39,7 +39,7 @@ bb19:                                             ; preds = %bb15
   br i1 %tmp22, label %bb24, label %bb23
 
 bb23:                                             ; preds = %bb19
-  call void @widget() #3
+  call void @widget()
   br label %bb24
 
 bb24:                                             ; preds = %bb23, %bb19, %bb15
@@ -57,7 +57,7 @@ bb29:                                             ; preds = %bb24
   br i1 %tmp32, label %bb33, label %bb34
 
 bb33:                                             ; preds = %bb29
-  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) #3
+  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144)
   br label %bb34
 
 bb34:                                             ; preds = %bb33, %bb29, %bb24
@@ -84,7 +84,7 @@ bb43:                                             ; preds = %bb39
   br i1 %tmp47, label %bb48, label %bb49
 
 bb48:                                             ; preds = %bb43
-  call void @widget() #3
+  call void @widget()
   br label %bb49
 
 bb49:                                             ; preds = %bb48, %bb43, %bb39, %bb34
@@ -103,7 +103,7 @@ bb54:                                             ; preds = %bb49
   br i1 %tmp57, label %bb58, label %bb59
 
 bb58:                                             ; preds = %bb54
-  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global3) #3
+  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global3)
   br label %bb59
 
 bb59:                                             ; preds = %bb58, %bb54, %bb49
@@ -121,11 +121,6 @@ bb65:                                             ; preds = %bb64, %bb59
   ret void
 }
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"}
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index 4ef5ef0..b32b87b 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -51,13 +51,13 @@ target triple = "x86_64-unknown-linux-gnu"
 @global2 = external unnamed_addr constant [79 x i8], align 1
 @global3 = external unnamed_addr constant [57 x i8], align 1
 
-declare void @widget() #0
+declare void @widget()
 
 ; Function Attrs: nounwind
-declare void @quux(ptr, i64, ptr, ...) #1
+declare void @quux(ptr, i64, ptr, ...)
 
 ; Function Attrs: nounwind uwtable
-define void @hoge(ptr %A) #2 {
+define void @hoge(ptr %A) {
 bb:
   br label %bb15
 
@@ -77,7 +77,7 @@ bb19:                                             ; preds = %bb15
   br i1 %tmp22, label %bb24, label %bb23
 
 bb23:                                             ; preds = %bb19
-  call void @widget() #3
+  call void @widget()
   br label %bb24
 
 bb24:                                             ; preds = %bb23, %bb19, %bb15
@@ -95,18 +95,13 @@ bb29:                                             ; preds = %bb24
   br i1 %tmp32, label %bb33, label %bb34
 
 bb33:                                             ; preds = %bb29
-  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144) #3
+  call void (ptr, i64, ptr, ...) @quux(ptr @global, i64 300, ptr @global2, i32 144)
   br label %bb34
 
 bb34:                                             ; preds = %bb33, %bb29, %bb24
   ret void
 }
 
-attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.8.0 (trunk 252261) (llvm/trunk 252271)"}
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index d9024cd..6c45b0d 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -8,10 +8,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @tonemasks = external global [17 x [6 x [56 x float]]], align 16
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i32, i1) #0
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i32, i1)
 
 ; Function Attrs: nounwind uwtable
-define void @setup_tone_curves() #1 {
+define void @setup_tone_curves() {
 entry:
   %workc = alloca [17 x [8 x [56 x float]]], align 16
   br label %for.cond7.preheader
@@ -46,9 +46,6 @@ for.inc104:                                       ; preds = %for.body74
   ret void
 }
 
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 4.0.0 (trunk 285057) (llvm/trunk 285063)"}
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index ed1e28c..f825cbf 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 
 ; Function Attrs: nounwind ssp uwtable
-define void @hoge(ptr %arg, ptr %arg5, i32 %arg6) #0 {
+define void @hoge(ptr %arg, ptr %arg5, i32 %arg6) {
 bb:
   br i1 undef, label %bb7, label %bb25
 
@@ -37,8 +37,6 @@ bb25:                                             ; preds = %bb21, %bb
   ret void
 }
 
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1, !2}
 !llvm.ident = !{!3}
 
diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll
index c965e2c..88c8c6a 100644
--- a/polly/test/ScopInfo/multidim_srem.ll
+++ b/polly/test/ScopInfo/multidim_srem.ll
@@ -19,11 +19,9 @@
 ; CHECK-NEXT:             [n] -> { Stmt_for_body_8[i0, i1, i2] -> MemRef_A[1, i1, i2] : (1 + i0) mod 2 = 0; Stmt_for_body_8[i0, i1, i2] -> MemRef_A[0, i1, i2] : (i0) mod 2 = 0 };
 ; CHECK-NEXT: }
 
-
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-
-define void @foo(i64 %n, ptr %A) #0 {
+define void @foo(i64 %n, ptr %A) {
 entry:
   br label %entry.split
 
@@ -83,13 +81,10 @@ for.end.16:                                       ; preds = %for.inc.14
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #1
+declare void @llvm.lifetime.start(i64, ptr nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
+declare void @llvm.lifetime.end(i64, ptr nocapture)
 
 !llvm.ident = !{!0}
 
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index 2c173a3..10cc57a 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -41,7 +41,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @.str = private unnamed_addr constant [8 x i8] c"Printf!\00", align 1
 
-define void @valid(ptr %A, ptr %B, i32 %N, i32 %M, ptr %C, i32 %Debug) #0 !dbg !4 {
+define void @valid(ptr %A, ptr %B, i32 %N, i32 %M, ptr %C, i32 %Debug) !dbg !4 {
 entry:
   call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !23, metadata !24), !dbg !25
   call void @llvm.dbg.value(metadata ptr %B, i64 0, metadata !26, metadata !24), !dbg !27
@@ -105,7 +105,7 @@ for.end:                                          ; preds = %for.cond.3
   br i1 %tobool, label %if.end.18, label %if.then.17, !dbg !72
 
 if.then.17:                                       ; preds = %for.end
-  %call = call i32 (ptr, ...) @printf(ptr nonnull @.str) #3, !dbg !73
+  %call = call i32 (ptr, ...) @printf(ptr nonnull @.str), !dbg !73
   br label %if.end.18, !dbg !73
 
 if.end.18:                                        ; preds = %for.end, %if.then.17
@@ -120,11 +120,11 @@ for.end.21:                                       ; preds = %for.cond
   ret void, !dbg !76
 }
 
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
-declare i32 @printf(ptr, ...) #2
+declare i32 @printf(ptr, ...)
 
-define void @invalid0(ptr %A) #0 !dbg !13 {
+define void @invalid0(ptr %A) !dbg !13 {
 entry:
   call void @llvm.dbg.value(metadata ptr %A, i64 0, metadata !77, metadata !24), !dbg !78
   call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !79, metadata !24), !dbg !81
@@ -173,12 +173,7 @@ for.end.7:                                        ; preds = %for.cond
   ret void, !dbg !105
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20, !21}
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index 55192b5..fa0c81f 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @cfft2(i32 %n, ptr %A) local_unnamed_addr #0 {
+define void @cfft2(i32 %n, ptr %A) local_unnamed_addr {
 entry:
   br i1 true, label %for.body.lr.ph, label %for.end
 
@@ -49,8 +49,6 @@ for.end:                                          ; preds = %for.inc, %entry
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.9.0 (trunk 273249) (llvm/trunk 273255)"}
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index 5a18945..163170c 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -48,7 +48,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define void @vec3(i64 %n, ptr, ptr) #0 {
+define void @vec3(i64 %n, ptr, ptr) {
   br label %.split
 
 .split:                                           ; preds = %0
@@ -141,8 +141,6 @@ define void @vec3(i64 %n, ptr, ptr) #0 {
   ret void
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"Ubuntu clang version 3.7.1-3ubuntu4 (tags/RELEASE_371/final) (based on LLVM 3.7.1)"}
diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll
index 11722dc..5c0cfa1 100644
--- a/polly/test/ScopInliner/ignore-declares.ll
+++ b/polly/test/ScopInliner/ignore-declares.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \
-; RUN: -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s
 
 ; Check that we do not crash if there are declares. We should skip function
 ; declarations and not try to query for domtree.
diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll
index ffd2ec9..58c556a 100644
--- a/polly/test/ScopInliner/invariant-load-func.ll
+++ b/polly/test/ScopInliner/invariant-load-func.ll
@@ -1,12 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-scop-inliner \
-; RUN: -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s
 
 ; Check that we inline a function that requires invariant load hoisting
 ; correctly.
 ; CHECK:    Max Loop Depth:  2
 
-; REQUIRES: pollyacc
-
 
 ; void to_be_inlined(int A[], int *begin, int *end) {
 ;     for(int i = *begin; i < *end; i++) {
diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll
index a5e3483..f12798a 100644
--- a/polly/test/ScopInliner/simple-inline-loop.ll
+++ b/polly/test/ScopInliner/simple-inline-loop.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s  2>&1 | FileCheck %s
 
 ; Check that we get the 2 nested loops by inlining `to_be_inlined` into
 ; `inline_site`.
diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll
index 76efd48..2bb0573 100644
--- a/polly/test/Simplify/phi_in_regionstmt.ll
+++ b/polly/test/Simplify/phi_in_regionstmt.ll
@@ -14,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @qmatrix = external local_unnamed_addr global [8 x ptr], align 16
 
 ; Function Attrs: nounwind uwtable
-define void @AssignQuantParam(ptr %pps) local_unnamed_addr #0 {
+define void @AssignQuantParam(ptr %pps) local_unnamed_addr {
 entry:
   br label %entry.split
 
@@ -46,8 +46,6 @@ if.else121.us.7:                                  ; preds = %if.else135.us.6, %i
   br label %if.end161
 }
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 6.0.0 (trunk 308961)"}
@@ -59,5 +57,4 @@ attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="fals
 !6 = !{!"any pointer", !3, i64 0}
 !7 = !{!6, !6, i64 0}
 
-
 ; CHECK: SCoP could not be simplified
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 2d9433f..599bc4b 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -3093,8 +3093,8 @@ libc_support_library(
 libc_support_library(
     name = "__support_sincosf_utils",
     hdrs = [
-        "src/__support/math/sincosf_utils.h",
         "src/__support/math/sincosf_float_eval.h",
+        "src/__support/math/sincosf_utils.h",
     ],
     deps = [
         ":__support_fputil_double_double",
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index b442731..522a2bd 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -110,6 +110,7 @@ libc_test_library(
         "//libc:__support_fputil_rounding_mode",
         "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
+        "//libc:__support_macros_optimization",
         "//libc:__support_macros_properties_architectures",
         "//libc:hdr_fenv_macros",
         "//libc:hdr_math_macros",