author     Vitaly Buka <vitalybuka@google.com>  2025-02-25 15:08:54 -0800
committer  Vitaly Buka <vitalybuka@google.com>  2025-02-25 15:08:54 -0800
commit     8fc4020c788ab93eb0ff82bda3db87ac80065674 (patch)
tree       aa87a699129cd68757a8a6bd20fcb42e44755f36
parent     250d83255545f68afe144a47eb985ea355e3ea44 (diff)
parent     30a7c816ee5ca998da960c6ab98e72903de40592 (diff)
[spr] changes introduced through rebase (users/vitalybuka/spr/main.ltopipelinescoro-de-duplicate-coro-passes)
Created using spr 1.3.4 [skip ci]
-rw-r--r--  .github/workflows/release-binaries.yml | 2
-rw-r--r--  bolt/lib/Core/BinaryFunction.cpp | 5
-rw-r--r--  bolt/lib/Profile/DataAggregator.cpp | 5
-rw-r--r--  bolt/test/X86/bolt-address-translation-yaml.test | 5
-rw-r--r--  bolt/test/X86/entry-point-fallthru.s | 24
-rw-r--r--  clang/Maintainers.rst | 1
-rw-r--r--  clang/cmake/caches/Release.cmake | 17
-rw-r--r--  clang/docs/ReleaseNotes.rst | 2
-rw-r--r--  clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h | 14
-rw-r--r--  clang/include/clang/CIR/Dialect/IR/CIRAttrs.td | 19
-rw-r--r--  clang/include/clang/CIR/Dialect/IR/CIRTypes.td | 17
-rw-r--r--  clang/lib/AST/ByteCode/Compiler.cpp | 12
-rw-r--r--  clang/lib/AST/ByteCode/Interp.cpp | 21
-rw-r--r--  clang/lib/AST/ByteCode/Interp.h | 5
-rw-r--r--  clang/lib/AST/ByteCode/Pointer.cpp | 23
-rw-r--r--  clang/lib/AST/ByteCode/Pointer.h | 3
-rw-r--r--  clang/lib/Basic/Targets/SPIR.h | 2
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 7
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenModule.cpp | 6
-rw-r--r--  clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 5
-rw-r--r--  clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 25
-rw-r--r--  clang/lib/CIR/Dialect/IR/CIRTypes.cpp | 72
-rw-r--r--  clang/lib/CodeGen/CGBuiltin.cpp | 4
-rw-r--r--  clang/lib/Headers/cpuid.h | 46
-rw-r--r--  clang/lib/Lex/HeaderSearch.cpp | 12
-rw-r--r--  clang/lib/Sema/AnalysisBasedWarnings.cpp | 3
-rw-r--r--  clang/lib/Sema/SemaChecking.cpp | 9
-rw-r--r--  clang/lib/Sema/SemaConcept.cpp | 25
-rw-r--r--  clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp | 1
-rw-r--r--  clang/test/AST/ByteCode/arrays.cpp | 13
-rw-r--r--  clang/test/AST/ByteCode/libcxx/pointer-subscript.cpp | 36
-rw-r--r--  clang/test/AST/ByteCode/unions.cpp | 19
-rw-r--r--  clang/test/CIR/func-simple.cpp | 6
-rw-r--r--  clang/test/CIR/global-var-simple.cpp | 3
-rw-r--r--  clang/test/CodeGenCXX/builtins-eh-wasm.cpp | 20
-rw-r--r--  clang/test/CodeGenCXX/merge-functions.cpp | 10
-rw-r--r--  clang/test/Headers/cpuid.c | 6
-rw-r--r--  clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp | 75
-rw-r--r--  clang/test/SemaTemplate/concepts-lambda.cpp | 18
-rwxr-xr-x  clang/utils/reduce-clang-crash.py (renamed from clang/utils/creduce-clang-crash.py) | 20
-rw-r--r--  flang-rt/CMakeLists.txt | 6
-rw-r--r--  flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp | 4
-rw-r--r--  flang/include/flang/Parser/dump-parse-tree.h | 10
-rw-r--r--  flang/include/flang/Parser/parse-tree.h | 85
-rw-r--r--  flang/lib/Lower/OpenMP/OpenMP.cpp | 30
-rw-r--r--  flang/lib/Parser/openmp-parsers.cpp | 78
-rw-r--r--  flang/lib/Parser/unparse.cpp | 26
-rw-r--r--  flang/lib/Semantics/check-omp-structure.cpp | 16
-rw-r--r--  flang/lib/Semantics/check-omp-structure.h | 4
-rw-r--r--  flang/module/cudadevice.f90 | 473
-rw-r--r--  flang/test/Lower/OpenMP/Todo/assume.f90 | 10
-rw-r--r--  flang/test/Lower/OpenMP/Todo/assumes.f90 | 6
-rw-r--r--  flang/test/Lower/OpenMP/host-eval.f90 | 103
-rw-r--r--  flang/test/Parser/OpenMP/assumption.f90 | 59
-rw-r--r--  libc/CMakeLists.txt | 9
-rw-r--r--  libc/cmake/modules/LLVMLibCHeaderRules.cmake | 11
-rw-r--r--  libc/config/linux/aarch64/entrypoints.txt | 4
-rw-r--r--  libc/config/linux/riscv/entrypoints.txt | 4
-rw-r--r--  libc/config/linux/x86_64/entrypoints.txt | 5
-rw-r--r--  libc/include/stdlib.yaml | 6
-rw-r--r--  libc/src/stdio/generic/fileno.cpp | 3
-rw-r--r--  libc/src/stdlib/CMakeLists.txt | 11
-rw-r--r--  libc/src/stdlib/a64l.cpp | 64
-rw-r--r--  libc/src/stdlib/a64l.h | 20
-rw-r--r--  libc/test/src/stdlib/CMakeLists.txt | 10
-rw-r--r--  libc/test/src/stdlib/a64l_test.cpp | 87
-rw-r--r--  libclc/amdgcn/lib/workitem/get_global_size.cl | 10
-rw-r--r--  libclc/clc/include/clc/float/definitions.h | 4
-rw-r--r--  libclc/clc/include/clc/math/clc_log.h | 12
-rw-r--r--  libclc/clc/include/clc/math/clc_log10.h | 12
-rw-r--r--  libclc/clc/include/clc/math/clc_log2.h | 12
-rw-r--r--  libclc/clc/include/clc/math/clc_round.h | 12
-rw-r--r--  libclc/clc/lib/generic/SOURCES | 4
-rw-r--r--  libclc/clc/lib/generic/math/clc_log.cl | 38
-rw-r--r--  libclc/clc/lib/generic/math/clc_log10.cl | 47
-rw-r--r--  libclc/clc/lib/generic/math/clc_log2.cl | 47
-rw-r--r--  libclc/clc/lib/generic/math/clc_log_base.h | 339
-rw-r--r--  libclc/clc/lib/generic/math/clc_round.cl | 6
-rw-r--r--  libclc/generic/include/clc/math/log10.h | 2
-rw-r--r--  libclc/generic/include/math/binary_intrin.inc | 29
-rw-r--r--  libclc/generic/include/math/ternary_intrin.inc | 30
-rw-r--r--  libclc/generic/lib/math/log.cl | 26
-rw-r--r--  libclc/generic/lib/math/log10.cl | 47
-rw-r--r--  libclc/generic/lib/math/log2.cl | 47
-rw-r--r--  libclc/generic/lib/math/log_base.h | 316
-rw-r--r--  libclc/generic/lib/math/round.cl | 7
-rw-r--r--  libcxx/include/future | 6
-rw-r--r--  libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp | 2
-rw-r--r--  libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp | 14
-rw-r--r--  libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp | 6
-rw-r--r--  libcxx/test/libcxx/input.output/iostream.format/input.streams/traits_mismatch.verify.cpp | 6
-rw-r--r--  libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp | 4
-rw-r--r--  libcxx/test/libcxx/input.output/string.streams/traits_mismatch.verify.cpp | 12
-rw-r--r--  libcxx/test/std/containers/sequences/array/array.fill/fill.verify.cpp | 4
-rw-r--r--  libcxx/test/std/containers/sequences/array/array.swap/swap.verify.cpp | 4
-rw-r--r--  libcxx/test/std/containers/sequences/array/array.tuple/get.verify.cpp | 4
-rw-r--r--  libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.verify.cpp | 4
-rw-r--r--  libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.comp/equal.pass.cpp | 3
-rw-r--r--  libcxx/test/std/strings/basic.string/char.bad.verify.cpp | 8
-rw-r--r--  libcxx/test/std/thread/futures/futures.async/thread_create_failure.pass.cpp | 61
-rw-r--r--  lldb/examples/python/fzf_history.py | 110
-rw-r--r--  lldb/include/lldb/Core/ModuleList.h | 5
-rw-r--r--  lldb/include/lldb/Symbol/UnwindPlan.h | 16
-rw-r--r--  lldb/include/lldb/Target/RegisterContextUnwind.h | 3
-rw-r--r--  lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp | 6
-rw-r--r--  lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp | 8
-rw-r--r--  lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp | 4
-rw-r--r--  lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp | 6
-rw-r--r--  lldb/source/Symbol/FuncUnwinders.cpp | 8
-rw-r--r--  lldb/source/Symbol/UnwindPlan.cpp | 52
-rw-r--r--  lldb/source/Target/RegisterContextUnwind.cpp | 32
-rw-r--r--  lldb/source/Target/StackFrame.cpp | 11
-rw-r--r--  lldb/source/Target/ThreadPlanStepRange.cpp | 8
-rw-r--r--  lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py | 5
-rw-r--r--  lldb/tools/lldb-dap/CMakeLists.txt | 1
-rw-r--r--  lldb/tools/lldb-dap/DAP.cpp | 42
-rw-r--r--  lldb/tools/lldb-dap/DAP.h | 29
-rw-r--r--  lldb/tools/lldb-dap/Handler/RequestHandler.cpp | 11
-rw-r--r--  lldb/tools/lldb-dap/Handler/ResponseHandler.cpp | 35
-rw-r--r--  lldb/tools/lldb-dap/Handler/ResponseHandler.h | 56
-rw-r--r--  lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp | 402
-rw-r--r--  lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp | 146
-rw-r--r--  lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp | 1343
-rw-r--r--  llvm/docs/RISCVUsage.rst | 3
-rw-r--r--  llvm/include/llvm/ADT/APFloat.h | 1
-rw-r--r--  llvm/include/llvm/Analysis/DXILResource.h | 22
-rw-r--r--  llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h | 40
-rw-r--r--  llvm/include/llvm/ExecutionEngine/Orc/GetTapiInterface.h | 28
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/OMP.td | 19
-rw-r--r--  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 38
-rw-r--r--  llvm/include/llvm/Support/AlignOf.h | 13
-rw-r--r--  llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h | 10
-rw-r--r--  llvm/lib/Analysis/DXILResource.cpp | 14
-rw-r--r--  llvm/lib/AsmParser/LLParser.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/SelectOptimize.cpp | 9
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 26
-rw-r--r--  llvm/lib/CodeGen/WasmEHPrepare.cpp | 6
-rw-r--r--  llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp | 3
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/CMakeLists.txt | 2
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp | 127
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/GetTapiInterface.cpp | 39
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 61
-rw-r--r--  llvm/lib/IR/Instruction.cpp | 5
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 4
-rw-r--r--  llvm/lib/Support/APFloat.cpp | 5
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 38
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 43
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 26
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 210
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILOpLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonBitTracker.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonCallingConv.td | 18
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 86
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 169
-rw-r--r--  llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 62
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 12
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 50
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 67
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 38
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 72
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 40
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 24
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 34
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 19
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h | 9
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 22
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h | 12
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 114
-rw-r--r--  llvm/lib/Transforms/IPO/MergeFunctions.cpp | 29
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Scalar/GVN.cpp | 78
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 166
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 15
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 17
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanCFG.h | 9
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 15
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/div.ll | 250
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/rem.ll | 250
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/shuffle-reverse.ll | 36
-rw-r--r--  llvm/test/Analysis/DXILResource/buffer-frombinding.ll | 26
-rw-r--r--  llvm/test/CodeGen/AArch64/bfis-in-loop.ll | 78
-rw-r--r--  llvm/test/CodeGen/AArch64/build-attributes-all.ll (renamed from llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll) | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/build-attributes-bti.ll (renamed from llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll) | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/build-attributes-gcs.ll (renamed from llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll) | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/build-attributes-pac.ll (renamed from llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll) | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/build-attributes-pauthabi.ll (renamed from llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll) | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll | 248
-rw-r--r--  llvm/test/CodeGen/AArch64/select_cc.ll | 16
-rw-r--r--  llvm/test/CodeGen/AArch64/selectopt-const.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 555
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 555
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll | 19
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll | 49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 82
-rw-r--r--  llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 133
-rw-r--r--  llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 256
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fma.f16.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 271
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll | 41
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.ll | 252
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 75
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 67
-rw-r--r--  llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll | 55
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll | 1029
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 37
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log.ll | 1591
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 1591
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 557
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 86
-rw-r--r--  llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remat-sop.mir | 22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 530
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll | 27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll | 78
-rw-r--r--  llvm/test/CodeGen/AMDGPU/v_pack.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-remat.mir | 48
-rw-r--r--  llvm/test/CodeGen/ARM/select-imm.ll | 18
-rw-r--r--  llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll | 19
-rw-r--r--  llvm/test/CodeGen/DirectX/clamp.ll | 2
-rw-r--r--  llvm/test/CodeGen/DirectX/discard.ll | 2
-rw-r--r--  llvm/test/CodeGen/DirectX/unsupported_intrinsic.ll | 11
-rw-r--r--  llvm/test/CodeGen/Hexagon/bittracker-regclass.ll | 40
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll | 39
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll | 40
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll | 50
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll | 39
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll | 50
-rw-r--r--  llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll | 39
-rw-r--r--  llvm/test/CodeGen/MSP430/shift-amount-threshold.ll | 3
-rw-r--r--  llvm/test/CodeGen/NVPTX/ldu-ldg.ll | 251
-rw-r--r--  llvm/test/CodeGen/NVPTX/variadics-backend.ll | 44
-rw-r--r--  llvm/test/CodeGen/PowerPC/llvm.sincos.ll | 147
-rw-r--r--  llvm/test/CodeGen/RISCV/or-is-add.ll | 114
-rw-r--r--  llvm/test/CodeGen/RISCV/select-const.ll | 2
-rw-r--r--  llvm/test/CodeGen/RISCV/select.ll | 63
-rw-r--r--  llvm/test/CodeGen/Thumb/branchless-cmp.ll | 104
-rw-r--r--  llvm/test/CodeGen/WebAssembly/exception.ll | 26
-rw-r--r--  llvm/test/CodeGen/WebAssembly/half-precision.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/andnot-blsmsk.ll | 327
-rw-r--r--  llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/fp128-libcalls.ll | 268
-rw-r--r--  llvm/test/CodeGen/X86/fp16-libcalls.ll | 469
-rw-r--r--  llvm/test/CodeGen/X86/llvm.acos.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.asin.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.atan.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.atan2.ll | 118
-rw-r--r--  llvm/test/CodeGen/X86/llvm.cos.ll | 82
-rw-r--r--  llvm/test/CodeGen/X86/llvm.cosh.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sin.ll | 82
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sinh.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.tan.ll | 100
-rw-r--r--  llvm/test/CodeGen/X86/llvm.tanh.ll | 100
-rw-r--r--  llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s | 25
-rw-r--r--  llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s | 51
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-aeabi-known.s | 43
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-bti.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s) | 10
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-attrs.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s) | 7
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-headers.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s) | 49
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-gcs.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s) | 10
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-mixed.s | 50
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-none.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s) | 14
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-numerical-tags.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s) | 10
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-out-of-order.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s) | 10
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-aeabi-pac.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s) | 12
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-non_aeabi-err.s (renamed from llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s) | 5
-rw-r--r--  llvm/test/MC/AArch64/build-attributes-asm-non_aeabi.s | 49
-rw-r--r--  llvm/test/MC/Disassembler/RISCV/xqci-invalid.txt | 10
-rw-r--r--  llvm/test/MC/RISCV/xrivosvizip-invalid.s | 6
-rw-r--r--  llvm/test/MC/RISCV/xrivosvizip-valid.s | 56
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/analysis-invalidation.ll | 24
-rw-r--r--  llvm/test/Transforms/InstCombine/load.ll | 30
-rw-r--r--  llvm/test/Transforms/MergeFunc/comdat.ll | 5
-rw-r--r--  llvm/test/Transforms/MergeFunc/linkonce_odr.ll | 12
-rw-r--r--  llvm/test/Transforms/MergeFunc/merge-linkonce-odr-used.ll | 18
-rw-r--r--  llvm/test/Transforms/MergeFunc/merge-linkonce-odr-weak-odr-mixed-used.ll | 18
-rw-r--r--  llvm/test/Transforms/MergeFunc/merge-linkonce-odr.ll | 29
-rw-r--r--  llvm/test/Transforms/MergeFunc/merge-weak-odr-used.ll | 18
-rw-r--r--  llvm/test/Transforms/MergeFunc/merge-weak-odr.ll | 34
-rw-r--r--  llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll | 67
-rw-r--r--  llvm/test/Verifier/invoke.ll | 2
-rw-r--r--  llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 30
-rw-r--r--  llvm/tools/llvm-size/llvm-size.cpp | 2
-rw-r--r--  llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 16
-rw-r--r--  llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn | 1
-rw-r--r--  llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn | 2
-rw-r--r--  mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td | 8
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt | 17
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h | 3
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td | 10
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td | 1
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h | 18
-rw-r--r--  mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td | 25
-rw-r--r--  mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td | 3
-rw-r--r--  mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h | 2
-rw-r--r--  mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td | 108
-rw-r--r--  mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td | 21
-rw-r--r--  mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 14
-rw-r--r--  mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h | 1
-rw-r--r--  mlir/include/mlir/IR/OperationSupport.h | 16
-rw-r--r--  mlir/include/mlir/Interfaces/DataLayoutInterfaces.td | 6
-rw-r--r--  mlir/include/mlir/Interfaces/LoopLikeInterface.td | 75
-rw-r--r--  mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 211
-rw-r--r--  mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp | 57
-rw-r--r--  mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp | 68
-rw-r--r--  mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp | 18
-rw-r--r--  mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp | 15
-rw-r--r--  mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt | 1
-rw-r--r--  mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp | 138
-rw-r--r--  mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp | 172
-rw-r--r--  mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 2
-rw-r--r--  mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 40
-rw-r--r--  mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 163
-rw-r--r--  mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp | 12
-rw-r--r--  mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp | 7
-rw-r--r--  mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp | 14
-rw-r--r--  mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp | 12
-rw-r--r--  mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp | 21
-rw-r--r--  mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 5
-rw-r--r--  mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4
-rw-r--r--  mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 179
-rw-r--r--  mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir | 40
-rw-r--r--  mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir | 192
-rw-r--r--  mlir/test/Dialect/Affine/loop-fusion-4.mlir | 78
-rw-r--r--  mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir | 14
-rw-r--r--  mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir | 36
-rw-r--r--  mlir/test/Dialect/MemRef/resolve-dim-ops.mlir | 6
-rw-r--r--  mlir/test/Dialect/SPIRV/IR/image-ops.mlir | 44
-rw-r--r--  mlir/test/Dialect/Tosa/availability.mlir | 11
-rw-r--r--  mlir/test/Dialect/Tosa/canonicalize.mlir | 36
-rw-r--r--  mlir/test/Dialect/Tosa/constant-op-fold.mlir | 46
-rw-r--r--  mlir/test/Dialect/Tosa/invalid.mlir | 70
-rw-r--r--  mlir/test/Dialect/Tosa/invalid_extension.mlir | 36
-rw-r--r--  mlir/test/Dialect/Tosa/level_check.mlir | 5
-rw-r--r--  mlir/test/Dialect/Tosa/ops.mlir | 11
-rw-r--r--  mlir/test/Dialect/Tosa/profile_all_unsupported.mlir | 2
-rw-r--r--  mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir | 2
-rw-r--r--  mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir | 2
-rw-r--r--  mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir | 18
-rw-r--r--  mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 23
-rw-r--r--  mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir | 262
-rw-r--r--  mlir/test/Dialect/Tosa/transpose-fold.mlir | 27
-rw-r--r--  mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir | 93
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-llvm.mlir | 88
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-target-spmd.mlir | 96
-rw-r--r--  mlir/test/Target/LLVMIR/openmp-todo.mlir | 71
-rw-r--r--  mlir/test/Target/SPIRV/image-ops.mlir | 24
-rw-r--r--  mlir/test/lib/Dialect/Test/TestTypeDefs.td | 4
-rw-r--r--  openmp/tools/archer/ompt-tsan.cpp | 13
-rw-r--r--  utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 41
-rw-r--r--  utils/bazel/llvm-project-overlay/mlir/test/Target/BUILD.bazel | 1
382 files changed, 15136 insertions(+), 8238 deletions(-)
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 64d59a5..231dd26 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -142,7 +142,7 @@ jobs:
# 2-3 hours to build on macOS, much slower than on Linux.
# The long build time causes the release build to time out on x86_64,
# so we need to disable flang there.
- target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;bolt;polly;mlir'"
+ target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;polly;mlir'"
fi
target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
fi
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 8385551..ff5eb5c 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -15,6 +15,7 @@
#include "bolt/Core/DynoStats.h"
#include "bolt/Core/HashUtilities.h"
#include "bolt/Core/MCPlusBuilder.h"
+#include "bolt/Utils/CommandLineOpts.h"
#include "bolt/Utils/NameResolver.h"
#include "bolt/Utils/NameShortener.h"
#include "bolt/Utils/Utils.h"
@@ -1753,8 +1754,8 @@ void BinaryFunction::postProcessEntryPoints() {
// In non-relocation mode there's potentially an external undetectable
// reference to the entry point and hence we cannot move this entry
// point. Optimizing without moving could be difficult.
- // In BAT mode, register any known entry points for CFG construction.
- if (!BC.HasRelocations && !BC.HasBATSection)
+ // In aggregation, register any known entry points for CFG construction.
+ if (!BC.HasRelocations && !opts::AggregateOnly)
setSimple(false);
const uint32_t Offset = KV.first;
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index a859f27..d20626b 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -831,9 +831,10 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
ParentFunc = FromFunc;
ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
+ const uint64_t FuncAddress = FromFunc->getAddress();
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
- BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To,
- Second.From)
+ BAT && BAT->isBATFunction(FuncAddress)
+ ? BAT->getFallthroughsInTrace(FuncAddress, First.To, Second.From)
: getFallthroughsInTrace(*FromFunc, First, Second, Count);
if (!FTs) {
LLVM_DEBUG(
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
index 3778891..a6a212d 100644
--- a/bolt/test/X86/bolt-address-translation-yaml.test
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -61,6 +61,11 @@ YAML-BAT-CHECK-NEXT: - bid: 0
YAML-BAT-CHECK-NEXT: insns: 26
YAML-BAT-CHECK-NEXT: hash: 0xA900AE79CFD40000
YAML-BAT-CHECK-NEXT: succ: [ { bid: 3, cnt: 0 }, { bid: 1, cnt: 0 } ]
+# Check fallthroughs in non-BAT function
+YAML-BAT-CHECK-NEXT: - bid: 27
+YAML-BAT-CHECK-NEXT: insns: 3
+YAML-BAT-CHECK-NEXT: hash: 0x30A1EBA77A903F0
+YAML-BAT-CHECK-NEXT: succ: [ { bid: 28, cnt: 1 } ]
# Calls from no-BAT to BAT function
YAML-BAT-CHECK: - bid: 28
YAML-BAT-CHECK-NEXT: insns: 13
diff --git a/bolt/test/X86/entry-point-fallthru.s b/bolt/test/X86/entry-point-fallthru.s
new file mode 100644
index 0000000..edf1424
--- /dev/null
+++ b/bolt/test/X86/entry-point-fallthru.s
@@ -0,0 +1,24 @@
+## Checks that fallthroughs spanning entry points are accepted in aggregation
+## mode.
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: link_fdata %s %t %t.preagg PREAGG
+# RUN: perf2bolt %t -p %t.preagg --pa -o %t.fdata | FileCheck %s
+# CHECK: traces mismatching disassembled function contents: 0
+
+ .globl main
+main:
+ .cfi_startproc
+ vmovaps %zmm31,%zmm3
+
+next:
+ add $0x4,%r9
+ add $0x40,%r10
+ dec %r14
+Ljmp:
+ jne main
+# PREAGG: T #Ljmp# #main# #Ljmp# 1
+ ret
+ .cfi_endproc
+.size main,.-main
diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst
index f9732aa..9eaa1ec 100644
--- a/clang/Maintainers.rst
+++ b/clang/Maintainers.rst
@@ -136,6 +136,7 @@ Clang static analyzer
| Balázs Benics
| benicsbalazs\@gmail.com (email), steakhal (Phabricator), steakhal (GitHub)
+| balazs.benics\@sonarsource.com (email), balazs-benics-sonarsource (GitHub)
Compiler options
~~~~~~~~~~~~~~~~
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index a1c68fc..aedbd1a 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -29,6 +29,13 @@ endfunction()
# cache file to CMake via -C. e.g.
#
# cmake -D LLVM_RELEASE_ENABLE_PGO=ON -C Release.cmake
+
+set (DEFAULT_PROJECTS "clang;lld;lldb;clang-tools-extra;polly;mlir;flang")
+# bolt only supports ELF, so only enable it for Linux.
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+ list(APPEND DEFAULT_PROJECTS "bolt")
+endif()
+
set (DEFAULT_RUNTIMES "compiler-rt;libcxx")
if (NOT WIN32)
list(APPEND DEFAULT_RUNTIMES "libcxxabi" "libunwind")
@@ -36,7 +43,7 @@ endif()
set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "")
set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "")
set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "")
-set(LLVM_RELEASE_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
+set(LLVM_RELEASE_ENABLE_PROJECTS ${DEFAULT_PROJECTS} CACHE STRING "")
# Note we don't need to add install here, since it is one of the pre-defined
# steps.
set(LLVM_RELEASE_FINAL_STAGE_TARGETS "clang;package;check-all;check-llvm;check-clang" CACHE STRING "")
@@ -118,6 +125,11 @@ if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin")
set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc")
endif()
+# Set flags for bolt
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+ set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -Wl,--emit-relocs,-znow")
+endif()
+
set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING)
@@ -125,6 +137,9 @@ set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FL
# Final Stage Config (stage2)
set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING)
set_final_stage_var(LLVM_ENABLE_PROJECTS "${LLVM_RELEASE_ENABLE_PROJECTS}" STRING)
+if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
+ set_final_stage_var(CLANG_BOLT "INSTRUMENT" STRING)
+endif()
set_final_stage_var(CPACK_GENERATOR "TXZ" STRING)
set_final_stage_var(CPACK_ARCHIVE_THREADS "0" STRING)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 657340c..3a0eab6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -124,6 +124,8 @@ Removed Compiler Flags
Attribute Changes in Clang
--------------------------
+Adding [[clang::unsafe_buffer_usage]] attribute to a method definition now turns off all -Wunsafe-buffer-usage
+related warnings within the method body.
- The ``no_sanitize`` attribute now accepts both ``gnu`` and ``clang`` names.
- Clang now diagnoses use of declaration attributes on void parameters. (#GH108819)
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index b4a961d..f03241a 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -10,6 +10,8 @@
#define LLVM_CLANG_CIR_DIALECT_BUILDER_CIRBASEBUILDER_H
#include "clang/CIR/Dialect/IR/CIRAttrs.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -23,6 +25,14 @@ public:
CIRBaseBuilderTy(mlir::MLIRContext &mlirContext)
: mlir::OpBuilder(&mlirContext) {}
+ cir::ConstantOp getBool(bool state, mlir::Location loc) {
+ return create<cir::ConstantOp>(loc, getBoolTy(), getCIRBoolAttr(state));
+ }
+ cir::ConstantOp getFalse(mlir::Location loc) { return getBool(false, loc); }
+ cir::ConstantOp getTrue(mlir::Location loc) { return getBool(true, loc); }
+
+ cir::BoolType getBoolTy() { return cir::BoolType::get(getContext()); }
+
cir::PointerType getPointerTo(mlir::Type ty) {
return cir::PointerType::get(getContext(), ty);
}
@@ -31,6 +41,10 @@ public:
return getPointerTo(cir::VoidType::get(getContext()));
}
+ cir::BoolAttr getCIRBoolAttr(bool state) {
+ return cir::BoolAttr::get(getContext(), getBoolTy(), state);
+ }
+
mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) {
auto valueAttr = mlir::IntegerAttr::get(
mlir::IntegerType::get(type.getContext(), 64), value);
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index bd1665e..097616b 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -36,6 +36,25 @@ class CIRUnitAttr<string name, string attrMnemonic, list<Trait> traits = []>
}
//===----------------------------------------------------------------------===//
+// BoolAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_BoolAttr : CIR_Attr<"Bool", "bool", [TypedAttrInterface]> {
+ let summary = "Represent true/false for !cir.bool types";
+ let description = [{
+ The BoolAttr represents a 'true' or 'false' value.
+ }];
+
+ let parameters = (ins AttributeSelfTypeParameter<
+ "", "cir::BoolType">:$type,
+ "bool":$value);
+
+ let assemblyFormat = [{
+ `<` $value `>`
+ }];
+}
+
+//===----------------------------------------------------------------------===//
// IntegerAttr
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index a32fb3c..fc8edbc 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -267,6 +267,20 @@ def CIR_PointerType : CIR_Type<"Pointer", "ptr",
}
//===----------------------------------------------------------------------===//
+// BoolType
+//===----------------------------------------------------------------------===//
+
+def CIR_BoolType :
+ CIR_Type<"Bool", "bool",
+ [DeclareTypeInterfaceMethods<DataLayoutTypeInterface>]> {
+
+ let summary = "CIR bool type";
+ let description = [{
+ `cir.bool` represents C++ bool type.
+ }];
+}
+
+//===----------------------------------------------------------------------===//
// FuncType
//===----------------------------------------------------------------------===//
@@ -355,7 +369,8 @@ def VoidPtr : Type<
//===----------------------------------------------------------------------===//
def CIR_AnyType : AnyTypeOf<[
- CIR_VoidType, CIR_IntType, CIR_AnyFloat, CIR_PointerType, CIR_FuncType
+ CIR_VoidType, CIR_BoolType, CIR_IntType, CIR_AnyFloat, CIR_PointerType,
+ CIR_FuncType
]>;
#endif // MLIR_CIR_DIALECT_CIR_TYPES
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index b5745e5..74f5d6eb 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -1674,6 +1674,7 @@ bool Compiler<Emitter>::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) {
const Expr *LHS = E->getLHS();
const Expr *RHS = E->getRHS();
const Expr *Index = E->getIdx();
+ const Expr *Base = E->getBase();
if (DiscardResult)
return this->discard(LHS) && this->discard(RHS);
@@ -1682,8 +1683,17 @@ bool Compiler<Emitter>::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) {
// side is the base.
bool Success = true;
for (const Expr *SubExpr : {LHS, RHS}) {
- if (!this->visit(SubExpr))
+ if (!this->visit(SubExpr)) {
Success = false;
+ continue;
+ }
+
+ // Expand the base if this is a subscript on a
+ // pointer expression.
+ if (SubExpr == Base && Base->getType()->isPointerType()) {
+ if (!this->emitExpandPtr(E))
+ Success = false;
+ }
}
if (!Success)
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 31126e4..5e0d2e9 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -139,17 +139,19 @@ static bool CheckActive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
Pointer U = Ptr.getBase();
Pointer C = Ptr;
- while (!U.isRoot() && U.inUnion() && !U.isActive()) {
- if (U.getField())
- C = U;
+ while (!U.isRoot() && !U.isActive()) {
+ // A little arbitrary, but this is what the current interpreter does.
+ // See the AnonymousUnion test in test/AST/ByteCode/unions.cpp.
+ // GCC's output is more similar to what we would get without
+ // this condition.
+ if (U.getRecord() && U.getRecord()->isAnonymousUnion())
+ break;
+
+ C = U;
U = U.getBase();
}
assert(C.isField());
- // Get the inactive field descriptor.
- const FieldDecl *InactiveField = C.getField();
- assert(InactiveField);
-
// Consider:
// union U {
// struct {
@@ -165,6 +167,11 @@ static bool CheckActive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
if (!U.getFieldDesc()->isUnion())
return true;
+ // Get the inactive field descriptor.
+ assert(!C.isActive());
+ const FieldDecl *InactiveField = C.getField();
+ assert(InactiveField);
+
// Find the active field of the union.
const Record *R = U.getRecord();
assert(R && R->isUnion() && "Not a union");
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index fa113aa..db35208 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -2589,7 +2589,10 @@ inline bool NarrowPtr(InterpState &S, CodePtr OpPC) {
inline bool ExpandPtr(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- S.Stk.push<Pointer>(Ptr.expand());
+ if (Ptr.isBlockPointer())
+ S.Stk.push<Pointer>(Ptr.expand());
+ else
+ S.Stk.push<Pointer>(Ptr);
return true;
}
diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 3033bd4..92cfa19 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -437,28 +437,35 @@ void Pointer::activate() const {
if (!getInlineDesc()->InUnion)
return;
- getInlineDesc()->IsActive = true;
+ auto activate = [](Pointer &P) -> void {
+ P.getInlineDesc()->IsActive = true;
+ };
+ auto deactivate = [](Pointer &P) -> void {
+ P.getInlineDesc()->IsActive = false;
+ };
- // Get the union, iterate over its fields and DEactivate all others.
+ // Unions might be nested etc., so find the topmost Pointer that's
+ // not in a union anymore.
Pointer UnionPtr = getBase();
- while (!UnionPtr.getFieldDesc()->isUnion())
+ while (!UnionPtr.isRoot() && UnionPtr.inUnion())
UnionPtr = UnionPtr.getBase();
+ assert(UnionPtr.getFieldDesc()->isUnion());
+
const Record *UnionRecord = UnionPtr.getRecord();
for (const Record::Field &F : UnionRecord->fields()) {
Pointer FieldPtr = UnionPtr.atField(F.Offset);
if (FieldPtr == *this) {
} else {
- FieldPtr.getInlineDesc()->IsActive = false;
+ deactivate(FieldPtr);
// FIXME: Recurse.
}
}
- Pointer B = getBase();
- while (!B.isRoot() && B.inUnion()) {
+ Pointer B = *this;
+ while (B != UnionPtr) {
+ activate(B);
// FIXME: Need to de-activate other fields of parent records.
- B.getInlineDesc()->IsActive = true;
- assert(B.isActive());
B = B.getBase();
}
}
diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h
index 3970d58..fd33ee9 100644
--- a/clang/lib/AST/ByteCode/Pointer.h
+++ b/clang/lib/AST/ByteCode/Pointer.h
@@ -494,9 +494,6 @@ public:
/// Returns the field information.
const FieldDecl *getField() const { return getFieldDesc()->asFieldDecl(); }
- /// Checks if the object is a union.
- bool isUnion() const;
-
/// Checks if the storage is extern.
bool isExtern() const {
if (isBlockPointer())
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 61f9ef7..610efa1 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -399,6 +399,8 @@ public:
HasLegalHalfType = true;
HasFloat16 = true;
HalfArgsAndReturns = true;
+
+ MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
}
bool hasBFloat16Type() const override { return true; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index b802705..24a9591 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -58,6 +58,13 @@ public:
cgf.getLoc(e->getExprLoc()), type,
builder.getAttr<cir::IntAttr>(type, e->getValue()));
}
+
+ mlir::Value VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *e) {
+ mlir::Type type = cgf.convertType(e->getType());
+ return builder.create<cir::ConstantOp>(
+ cgf.getLoc(e->getExprLoc()), type,
+ builder.getCIRBoolAttr(e->getValue()));
+ }
};
} // namespace
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index c1d3265..d8acc99 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -141,7 +141,11 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd,
if (APValue *value = initDecl->evaluateValue()) {
switch (value->getKind()) {
case APValue::Int: {
- initializer = builder.getAttr<cir::IntAttr>(type, value->getInt());
+ if (mlir::isa<cir::BoolType>(type))
+ initializer =
+ builder.getCIRBoolAttr(value->getInt().getZExtValue());
+ else
+ initializer = builder.getAttr<cir::IntAttr>(type, value->getInt());
break;
}
case APValue::Float: {
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index 551b43e..16aec10 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -108,6 +108,11 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
resultType = cgm.VoidTy;
break;
+ // bool
+ case BuiltinType::Bool:
+ resultType = cir::BoolType::get(&getMLIRContext());
+ break;
+
// Signed integral types.
case BuiltinType::Char_S:
case BuiltinType::Int:
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 10ad7fb..bfc74d4 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -25,6 +25,23 @@ using namespace cir;
//===----------------------------------------------------------------------===//
// CIR Dialect
//===----------------------------------------------------------------------===//
+namespace {
+struct CIROpAsmDialectInterface : public OpAsmDialectInterface {
+ using OpAsmDialectInterface::OpAsmDialectInterface;
+
+ AliasResult getAlias(Type type, raw_ostream &os) const final {
+ return AliasResult::NoAlias;
+ }
+
+ AliasResult getAlias(Attribute attr, raw_ostream &os) const final {
+ if (auto boolAttr = mlir::dyn_cast<cir::BoolAttr>(attr)) {
+ os << (boolAttr.getValue() ? "true" : "false");
+ return AliasResult::FinalAlias;
+ }
+ return AliasResult::NoAlias;
+ }
+};
+} // namespace
void cir::CIRDialect::initialize() {
registerTypes();
@@ -33,6 +50,7 @@ void cir::CIRDialect::initialize() {
#define GET_OP_LIST
#include "clang/CIR/Dialect/IR/CIROps.cpp.inc"
>();
+ addInterfaces<CIROpAsmDialectInterface>();
}
//===----------------------------------------------------------------------===//
@@ -112,6 +130,13 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
return success();
}
+ if (mlir::isa<cir::BoolAttr>(attrType)) {
+ if (!mlir::isa<cir::BoolType>(opType))
+ return op->emitOpError("result type (")
+ << opType << ") must be '!cir.bool' for '" << attrType << "'";
+ return success();
+ }
+
if (mlir::isa<cir::IntAttr, cir::FPAttr>(attrType)) {
auto at = cast<TypedAttr>(attrType);
if (at.getType() != opType) {
diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index 48be11b..d1b143e 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -125,12 +125,6 @@ uint64_t IntType::getABIAlignment(const mlir::DataLayout &dataLayout,
return (uint64_t)(getWidth() / 8);
}
-uint64_t
-IntType::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return (uint64_t)(getWidth() / 8);
-}
-
mlir::LogicalResult
IntType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
unsigned width, bool isSigned) {
@@ -163,12 +157,6 @@ SingleType::getABIAlignment(const mlir::DataLayout &dataLayout,
return (uint64_t)(getWidth() / 8);
}
-uint64_t
-SingleType::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return (uint64_t)(getWidth() / 8);
-}
-
const llvm::fltSemantics &DoubleType::getFloatSemantics() const {
return llvm::APFloat::IEEEdouble();
}
@@ -185,12 +173,6 @@ DoubleType::getABIAlignment(const mlir::DataLayout &dataLayout,
return (uint64_t)(getWidth() / 8);
}
-uint64_t
-DoubleType::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return (uint64_t)(getWidth() / 8);
-}
-
const llvm::fltSemantics &FP16Type::getFloatSemantics() const {
return llvm::APFloat::IEEEhalf();
}
@@ -206,12 +188,6 @@ uint64_t FP16Type::getABIAlignment(const mlir::DataLayout &dataLayout,
return (uint64_t)(getWidth() / 8);
}
-uint64_t
-FP16Type::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return (uint64_t)(getWidth() / 8);
-}
-
const llvm::fltSemantics &BF16Type::getFloatSemantics() const {
return llvm::APFloat::BFloat();
}
@@ -227,12 +203,6 @@ uint64_t BF16Type::getABIAlignment(const mlir::DataLayout &dataLayout,
return (uint64_t)(getWidth() / 8);
}
-uint64_t
-BF16Type::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return (uint64_t)(getWidth() / 8);
-}
-
const llvm::fltSemantics &FP80Type::getFloatSemantics() const {
return llvm::APFloat::x87DoubleExtended();
}
@@ -249,12 +219,6 @@ uint64_t FP80Type::getABIAlignment(const mlir::DataLayout &dataLayout,
return 16;
}
-uint64_t
-FP80Type::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return 16;
-}
-
const llvm::fltSemantics &FP128Type::getFloatSemantics() const {
return llvm::APFloat::IEEEquad();
}
@@ -270,12 +234,6 @@ uint64_t FP128Type::getABIAlignment(const mlir::DataLayout &dataLayout,
return 16;
}
-uint64_t
-FP128Type::getPreferredAlignment(const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- return 16;
-}
-
const llvm::fltSemantics &LongDoubleType::getFloatSemantics() const {
return mlir::cast<cir::CIRFPTypeInterface>(getUnderlying())
.getFloatSemantics();
@@ -295,13 +253,6 @@ LongDoubleType::getABIAlignment(const mlir::DataLayout &dataLayout,
.getABIAlignment(dataLayout, params);
}
-uint64_t LongDoubleType::getPreferredAlignment(
- const ::mlir::DataLayout &dataLayout,
- mlir::DataLayoutEntryListRef params) const {
- return mlir::cast<mlir::DataLayoutTypeInterface>(getUnderlying())
- .getPreferredAlignment(dataLayout, params);
-}
-
LogicalResult
LongDoubleType::verify(function_ref<InFlightDiagnostic()> emitError,
mlir::Type underlying) {
@@ -382,6 +333,22 @@ llvm::ArrayRef<mlir::Type> FuncType::getReturnTypes() const {
bool FuncType::isVoid() const { return mlir::isa<VoidType>(getReturnType()); }
//===----------------------------------------------------------------------===//
+// BoolType
+//===----------------------------------------------------------------------===//
+
+llvm::TypeSize
+BoolType::getTypeSizeInBits(const ::mlir::DataLayout &dataLayout,
+ ::mlir::DataLayoutEntryListRef params) const {
+ return llvm::TypeSize::getFixed(8);
+}
+
+uint64_t
+BoolType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
+ ::mlir::DataLayoutEntryListRef params) const {
+ return 1;
+}
+
+//===----------------------------------------------------------------------===//
// PointerType Definitions
//===----------------------------------------------------------------------===//
@@ -399,13 +366,6 @@ PointerType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
return 8;
}
-uint64_t PointerType::getPreferredAlignment(
- const ::mlir::DataLayout &dataLayout,
- ::mlir::DataLayoutEntryListRef params) const {
- // FIXME: improve this in face of address spaces
- return 8;
-}
-
mlir::LogicalResult
PointerType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type pointee) {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 96d4e97..65fac01 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -22524,11 +22524,11 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
Value *Tag = EmitScalarExpr(E->getArg(0));
Value *Obj = EmitScalarExpr(E->getArg(1));
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
- return Builder.CreateCall(Callee, {Tag, Obj});
+ return EmitRuntimeCallOrInvoke(Callee, {Tag, Obj});
}
case WebAssembly::BI__builtin_wasm_rethrow: {
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
- return Builder.CreateCall(Callee);
+ return EmitRuntimeCallOrInvoke(Callee);
}
case WebAssembly::BI__builtin_wasm_memory_atomic_wait32: {
Value *Addr = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 2601aa57..52addb7 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -267,18 +267,18 @@
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
-#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
- __asm(" xchgq %%rbx,%q1\n" \
- " cpuid\n" \
- " xchgq %%rbx,%q1" \
- : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
+ __asm(" xchg{q|} {%%|}rbx,%q1\n" \
+ " cpuid\n" \
+ " xchg{q|} {%%|}rbx,%q1" \
+ : "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf))
-#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
- __asm(" xchgq %%rbx,%q1\n" \
- " cpuid\n" \
- " xchgq %%rbx,%q1" \
- : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
+ __asm(" xchg{q|} {%%|}rbx,%q1\n" \
+ " cpuid\n" \
+ " xchg{q|} {%%|}rbx,%q1" \
+ : "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
@@ -289,20 +289,22 @@ static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
#ifdef __i386__
int __cpuid_supported;
- __asm(" pushfl\n"
- " popl %%eax\n"
- " movl %%eax,%%ecx\n"
- " xorl $0x00200000,%%eax\n"
- " pushl %%eax\n"
- " popfl\n"
- " pushfl\n"
- " popl %%eax\n"
- " movl $0,%0\n"
- " cmpl %%eax,%%ecx\n"
+ __asm(" pushf{l|d}\n"
+ " pop{l|} {%%|}eax\n"
+ " mov{l|} {%%eax,%%ecx|ecx,eax}\n"
+ " xor{l|} {$0x00200000,%%eax|eax,0x00200000}\n"
+ " push{l|} {%%|}eax\n"
+ " popf{l|d}\n"
+ " pushf{l|d}\n"
+ " pop{l|} {%%|}eax\n"
+ " mov{l|} {$0,%0|%0,0}\n"
+ " cmp{l|} {%%eax,%%ecx|ecx,eax}\n"
" je 1f\n"
- " movl $1,%0\n"
+ " mov{l|} {$1,%0|%0,1}\n"
"1:"
- : "=r" (__cpuid_supported) : : "eax", "ecx");
+ : "=r"(__cpuid_supported)
+ :
+ : "eax", "ecx");
if (!__cpuid_supported)
return 0;
#endif
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index bf8fe44..6fc477d 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -149,11 +149,17 @@ std::vector<bool> HeaderSearch::collectVFSUsageAndClear() const {
llvm::vfs::FileSystem &RootFS = FileMgr.getVirtualFileSystem();
// TODO: This only works if the `RedirectingFileSystem`s were all created by
- // `createVFSFromOverlayFiles`.
+ // `createVFSFromOverlayFiles`. But at least exclude the ones with null
+ // OverlayFileDir.
RootFS.visit([&](llvm::vfs::FileSystem &FS) {
if (auto *RFS = dyn_cast<llvm::vfs::RedirectingFileSystem>(&FS)) {
- VFSUsage.push_back(RFS->hasBeenUsed());
- RFS->clearHasBeenUsed();
+ // Skip a `RedirectingFileSystem` with null OverlayFileDir which indicates
+ // that they aren't created by createVFSFromOverlayFiles from the overlays
+ // in HeaderSearchOption::VFSOverlayFiles.
+ if (!RFS->getOverlayFileDir().empty()) {
+ VFSUsage.push_back(RFS->hasBeenUsed());
+ RFS->clearHasBeenUsed();
+ }
}
});
assert(VFSUsage.size() == getHeaderSearchOpts().VFSOverlayFiles.size() &&
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index f21e571..afdc0ea 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -2566,6 +2566,9 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
// The Callback function that performs analyses:
auto CallAnalyzers = [&](const Decl *Node) -> void {
+ if (Node->hasAttr<UnsafeBufferUsageAttr>())
+ return;
+
// Perform unsafe buffer usage analysis:
if (!Diags.isIgnored(diag::warn_unsafe_buffer_operation,
Node->getBeginLoc()) ||
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 81209f2..f9926c6 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -7390,10 +7390,11 @@ bool DecomposePrintfHandler::GetSpecifiers(
const char *Str = Data.data();
llvm::SmallBitVector BV;
UncoveredArgHandler UA;
+ const Expr *PrintfArgs[] = {FSL->getFormatString()};
DecomposePrintfHandler H(S, FSL, FSL->getFormatString(), Type, 0, 0, IsObjC,
- Str, Sema::FAPK_Elsewhere, {FSL->getFormatString()},
- 0, InFunctionCall, Sema::VariadicDoesNotApply, BV,
- UA, Args);
+ Str, Sema::FAPK_Elsewhere, PrintfArgs, 0,
+ InFunctionCall, Sema::VariadicDoesNotApply, BV, UA,
+ Args);
if (!analyze_format_string::ParsePrintfString(
H, Str, Str + Data.size(), S.getLangOpts(), S.Context.getTargetInfo(),
@@ -7402,7 +7403,7 @@ bool DecomposePrintfHandler::GetSpecifiers(
if (H.HadError)
return false;
- std::sort(
+ std::stable_sort(
Args.begin(), Args.end(),
[](const EquatableFormatArgument &A, const EquatableFormatArgument &B) {
return A.getPosition() < B.getPosition();
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 8a77cbf..a7b609f 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -711,9 +711,32 @@ bool Sema::addInstantiatedCapturesToScope(
unsigned Instantiated = 0;
+ // FIXME: This is a workaround for not having deferred lambda body
+ // instantiation.
+ // When transforming a lambda's body, if we encounter another call to a
+ // nested lambda that contains a constraint expression, we add all of the
+ // outer lambda's instantiated captures to the current instantiation scope to
+ // facilitate constraint evaluation. However, these captures don't appear in
+ // the CXXRecordDecl until after the lambda expression is rebuilt, so we
+ // pull them out from the corresponding LSI.
+ LambdaScopeInfo *InstantiatingScope = nullptr;
+ if (LambdaPattern->capture_size() && !LambdaClass->capture_size()) {
+ for (FunctionScopeInfo *Scope : llvm::reverse(FunctionScopes)) {
+ auto *LSI = dyn_cast<LambdaScopeInfo>(Scope);
+ if (!LSI ||
+ LSI->CallOperator->getTemplateInstantiationPattern() != PatternDecl)
+ continue;
+ InstantiatingScope = LSI;
+ break;
+ }
+ assert(InstantiatingScope);
+ }
+
auto AddSingleCapture = [&](const ValueDecl *CapturedPattern,
unsigned Index) {
- ValueDecl *CapturedVar = LambdaClass->getCapture(Index)->getCapturedVar();
+ ValueDecl *CapturedVar =
+ InstantiatingScope ? InstantiatingScope->Captures[Index].getVariable()
+ : LambdaClass->getCapture(Index)->getCapturedVar();
assert(CapturedVar->isInitCapture());
Scope.InstantiatedLocal(CapturedPattern, CapturedVar);
};
diff --git a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
index 8955cb2..12bf12a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
@@ -314,7 +314,6 @@ void MacOSKeychainAPIChecker::checkPreStmt(const CallExpr *CE,
RegionArgIsBad = true;
}
- assert(ArgSM);
// Is the argument to the call being tracked?
const AllocationState *AS = State->get<AllocatedData>(ArgSM);
if (!AS)
diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp
index 9204179..2ef0cf88 100644
--- a/clang/test/AST/ByteCode/arrays.cpp
+++ b/clang/test/AST/ByteCode/arrays.cpp
@@ -662,3 +662,16 @@ namespace InvalidIndex {
}
static_assert(foo(0) == 1, "");
}
+
+namespace PointerSubscript {
+ template<typename T>
+ constexpr T foo() {
+ T ss[] = {{}, {}, {}};
+ T *s = &ss[0];
+
+ return s[2];
+ }
+ static_assert(foo<int>() == 0);
+ struct S{};
+ static_assert((foo<S>(), true));
+}
diff --git a/clang/test/AST/ByteCode/libcxx/pointer-subscript.cpp b/clang/test/AST/ByteCode/libcxx/pointer-subscript.cpp
new file mode 100644
index 0000000..68e25dc
--- /dev/null
+++ b/clang/test/AST/ByteCode/libcxx/pointer-subscript.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -std=c++2c -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -std=c++2c -verify=ref,both %s
+
+// both-no-diagnostics
+
+namespace std {
+inline namespace __1 {
+template <class _Tp> class unique_ptr;
+template <class _Tp> class unique_ptr<_Tp[]> {
+public:
+ _Tp* __ptr_;
+
+public:
+ constexpr _Tp&
+ operator[](unsigned i) const {
+ return __ptr_[i];
+ };
+};
+} // namespace __1
+} // namespace std
+struct WithTrivialDtor {
+ int x = 6;
+ constexpr friend void operator==(WithTrivialDtor const &x,
+ WithTrivialDtor const &y) {
+ (void)(x.x == y.x);
+ }
+};
+constexpr bool test() {
+
+ WithTrivialDtor array[50];
+ std::unique_ptr<WithTrivialDtor[]> p(&array[0]);
+ (void)(p[1] == WithTrivialDtor());
+
+ return true;
+}
+static_assert(test());
diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp
index c6b5e34..2064cae 100644
--- a/clang/test/AST/ByteCode/unions.cpp
+++ b/clang/test/AST/ByteCode/unions.cpp
@@ -485,4 +485,23 @@ namespace IFD {
}
static_assert(test());
}
+
+namespace AnonymousUnion {
+ struct A {
+ int x;
+ union { int p, q; };
+ };
+ union B {
+ A a;
+ int bb;
+ };
+
+ constexpr B return_init_all() {
+ B b = {.bb = 1};
+ b.a.x = 2;
+ return b;
+ }
+ static_assert(return_init_all().a.p == 7); // both-error {{}} \
+ // both-note {{read of member 'p' of union with no active member}}
+}
#endif
diff --git a/clang/test/CIR/func-simple.cpp b/clang/test/CIR/func-simple.cpp
index 10c49bc..22c120d 100644
--- a/clang/test/CIR/func-simple.cpp
+++ b/clang/test/CIR/func-simple.cpp
@@ -51,3 +51,9 @@ unsigned long long ullfunc() { return 42ull; }
// CHECK: %0 = cir.const #cir.int<42> : !cir.int<u, 64>
// CHECK: cir.return %0 : !cir.int<u, 64>
// CHECK: }
+
+bool boolfunc() { return true; }
+// CHECK: cir.func @boolfunc() -> !cir.bool {
+// CHECK: %0 = cir.const #true
+// CHECK: cir.return %0 : !cir.bool
+// CHECK: }
diff --git a/clang/test/CIR/global-var-simple.cpp b/clang/test/CIR/global-var-simple.cpp
index 237070a..dfe8371 100644
--- a/clang/test/CIR/global-var-simple.cpp
+++ b/clang/test/CIR/global-var-simple.cpp
@@ -58,6 +58,9 @@ _BitInt(20) sb20;
unsigned _BitInt(48) ub48;
// CHECK: cir.global @ub48 : !cir.int<u, 48>
+bool boolfalse = false;
+// CHECK: cir.global @boolfalse = #false
+
_Float16 f16;
// CHECK: cir.global @f16 : !cir.f16
diff --git a/clang/test/CodeGenCXX/builtins-eh-wasm.cpp b/clang/test/CodeGenCXX/builtins-eh-wasm.cpp
new file mode 100644
index 0000000..b0f763d
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtins-eh-wasm.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -fexceptions -fcxx-exceptions -target-feature +reference-types -target-feature +exception-handling -target-feature +multivalue -exception-model=wasm -emit-llvm -o - %s | FileCheck %s
+
+// Check that __builtin_wasm_throw and __builtin_wasm_rethrow are lowered to
+// 'invoke's rather than plain 'call's when they appear inside a try-catch.
+
+void throw_in_try(void *obj) {
+ try {
+ __builtin_wasm_throw(0, obj);
+ } catch (...) {
+ }
+ // CHECK: invoke void @llvm.wasm.throw(i32 0, ptr %{{.*}})
+}
+
+void rethrow_in_try() {
+ try {
+ __builtin_wasm_rethrow();
+ } catch (...) {
+ }
+ // CHECK: invoke void @llvm.wasm.rethrow()
+}
diff --git a/clang/test/CodeGenCXX/merge-functions.cpp b/clang/test/CodeGenCXX/merge-functions.cpp
index 892234c..d3afd80 100644
--- a/clang/test/CodeGenCXX/merge-functions.cpp
+++ b/clang/test/CodeGenCXX/merge-functions.cpp
@@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -O0 -fmerge-functions -emit-llvm -o - -x c++ < %s | FileCheck %s -implicit-check-not=_ZN1A1gEiPi
-// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -O1 -fmerge-functions -emit-llvm -o - -x c++ < %s | FileCheck %s -implicit-check-not=_ZN1A1gEiPi
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -O0 -fmerge-functions -emit-llvm -o - -x c++ < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -O1 -fmerge-functions -emit-llvm -o - -x c++ < %s | FileCheck %s
// Basic functionality test. Function merging doesn't kick in on functions that
// are too simple.
@@ -10,4 +10,8 @@ struct A {
virtual int g(int x, int *p) { return x ? *p : 1; }
} a;
-// CHECK: define {{.*}} @_ZN1A1fEiPi
+// CHECK: define linkonce_odr noundef i32 @_ZN1A1gEiPi(
+// CHECK: tail call noundef i32 @0(
+
+// CHECK: define linkonce_odr noundef i32 @_ZN1A1fEiPi(
+// CHECK: tail call noundef i32 @0(
diff --git a/clang/test/Headers/cpuid.c b/clang/test/Headers/cpuid.c
index 6ed12ec..1c4f29d 100644
--- a/clang/test/Headers/cpuid.c
+++ b/clang/test/Headers/cpuid.c
@@ -4,9 +4,9 @@
#include <cpuid.h>
#include <cpuid.h> // Make sure multiple inclusion protection works.
-// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchgq %rbx,${1:q}\0A cpuid\0A xchgq %rbx,${1:q}", "={ax},=r,={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}})
-// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchgq %rbx,${1:q}\0A cpuid\0A xchgq %rbx,${1:q}", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}}, i32 %{{[a-z0-9]+}})
-// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchgq %rbx,${1:q}\0A cpuid\0A xchgq %rbx,${1:q}", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}}, i32 %{{[a-z0-9]+}})
+// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchg$(q$|$) $(%$|$)rbx,${1:q}\0A cpuid\0A xchg$(q$|$) $(%$|$)rbx,${1:q}", "={ax},=r,={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}})
+// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchg$(q$|$) $(%$|$)rbx,${1:q}\0A cpuid\0A xchg$(q$|$) $(%$|$)rbx,${1:q}", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}}, i32 %{{[a-z0-9]+}})
+// CHECK-64: {{.*}} call { i32, i32, i32, i32 } asm " xchg$(q$|$) $(%$|$)rbx,${1:q}\0A cpuid\0A xchg$(q$|$) $(%$|$)rbx,${1:q}", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}}, i32 %{{[a-z0-9]+}})
// CHECK-32: {{.*}} call { i32, i32, i32, i32 } asm "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}})
// CHECK-32: {{.*}} call { i32, i32, i32, i32 } asm "cpuid", "={ax},={bx},={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 %{{[a-z0-9]+}}, i32 %{{[a-z0-9]+}})
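
The $(att$|intel$) groups are LLVM IR's encoding of assembler-dialect alternatives: the first branch is emitted for AT&T output and the second for Intel, so the regenerated header now also works under -masm=intel. At the source level this corresponds to the {att|intel} syntax of multi-dialect asm templates; a small sketch (not taken from cpuid.h itself):

    /* With -masm=att this emits "movl %1, %0"; with -masm=intel it emits
       "mov %0, %1" -- both the mnemonic suffix and the operand order are
       selected per dialect, with registers substituted for %0/%1. */
    static inline int copy_int(int src) {
      int dst;
      __asm__("mov{l} {%1, %0|%0, %1}" : "=r"(dst) : "r"(src));
      return dst;
    }
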
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp
index 724d4446..f3abe87 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp
@@ -119,16 +119,15 @@ struct HoldsUnsafeMembers {
[[clang::unsafe_buffer_usage]]
HoldsUnsafeMembers(int i)
- : FromCtor(i), // expected-warning{{function introduces unsafe buffer manipulation}}
- FromCtor2{i} // expected-warning{{function introduces unsafe buffer manipulation}}
- {}
+ : FromCtor(i),
+ FromCtor2{i} {}
HoldsUnsafeMembers(float f)
: HoldsUnsafeMembers(0) {} // expected-warning{{function introduces unsafe buffer manipulation}}
UnsafeMembers FromCtor;
UnsafeMembers FromCtor2;
- UnsafeMembers FromField{3}; // expected-warning 2{{function introduces unsafe buffer manipulation}}
+ UnsafeMembers FromField{3}; // expected-warning {{function introduces unsafe buffer manipulation}}
};
struct SubclassUnsafeMembers : public UnsafeMembers {
@@ -138,8 +137,7 @@ struct SubclassUnsafeMembers : public UnsafeMembers {
[[clang::unsafe_buffer_usage]]
SubclassUnsafeMembers(int i)
- : UnsafeMembers(i) // expected-warning{{function introduces unsafe buffer manipulation}}
- {}
+      : UnsafeMembers(i) {}
};
// https://github.com/llvm/llvm-project/issues/80482
@@ -245,3 +243,68 @@ struct AggregateViaDefaultInit {
void testAggregateViaDefaultInit() {
AggregateViaDefaultInit A;
};
+
+struct A {
+ int arr[2];
+
+ [[clang::unsafe_buffer_usage]]
+ int *ptr;
+};
+
+namespace std{
+ template <typename T> class span {
+
+ T *elements;
+
+ public:
+
+ constexpr span(T *, unsigned){}
+
+ template<class Begin, class End>
+ constexpr span(Begin first, End last){}
+
+ constexpr T* data() const noexcept {
+ return elements;
+ }
+ };
+}
+
+[[clang::unsafe_buffer_usage]]
+void check_no_warnings(unsigned idx) {
+ int *arr = new int[20];
+
+ int k = arr[idx]; // no-warning
+
+ std::span<int> sp = {arr, 20}; // no-warning
+ A *ptr = reinterpret_cast<A*> (sp.data()); // no-warning
+ A a;
+ a.ptr = arr; // no-warning
+}
+
+[[clang::unsafe_buffer_usage]]
+void check_no_warning_variadic(unsigned idx, int arr[20], ...) {
+ int k = arr[idx]; // no-warning
+
+ std::span<int> sp = {arr, 20}; // no-warning
+ A *ptr = reinterpret_cast<A*> (sp.data()); // no-warning
+ A a;
+ a.ptr = arr; // no-warning
+}
+
+template<typename T>
+[[clang::unsafe_buffer_usage]]
+void check_no_warnings_template(unsigned idx, T* arr) {
+ int k = arr[idx]; // no-warning
+
+ std::span<int> sp = {arr, 20}; // no-warning
+ A *ptr = reinterpret_cast<A*> (sp.data()); // no-warning
+ A a;
+ a.ptr = arr; // no-warning
+}
+
+void invoke_methods() {
+ int array[20];
+ check_no_warnings(30); //expected-warning{{function introduces unsafe buffer manipulation}}
+ check_no_warning_variadic(15, array); //expected-warning{{function introduces unsafe buffer manipulation}}
+ check_no_warnings_template(10, array); //expected-warning{{function introduces unsafe buffer manipulation}}
+}
diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp
index 306f86c..dcb09c7 100644
--- a/clang/test/SemaTemplate/concepts-lambda.cpp
+++ b/clang/test/SemaTemplate/concepts-lambda.cpp
@@ -307,3 +307,21 @@ void test() {
}
}
+
+namespace GH128175 {
+
+template <class> void f() {
+ [i{0}] {
+ [&] {
+ [&] {
+ []()
+ requires true
+ {}();
+ }();
+ }();
+ }();
+}
+
+template void f<int>();
+
+}
diff --git a/clang/utils/creduce-clang-crash.py b/clang/utils/reduce-clang-crash.py
index 180dfbe..22e3dbb 100755
--- a/clang/utils/creduce-clang-crash.py
+++ b/clang/utils/reduce-clang-crash.py
@@ -1,11 +1,12 @@
#!/usr/bin/env python3
-"""Calls C-Reduce to create a minimal reproducer for clang crashes.
-Unknown arguments are treated at creduce options.
+"""Calls reduction tools to create minimal reproducers for clang crashes.
+
+Unknown arguments are treated as cvise/creduce options.
Output files:
*.reduced.sh -- crash reproducer with minimal arguments
*.reduced.cpp -- the reduced file
- *.test.sh -- interestingness test for C-Reduce
+ *.test.sh -- interestingness test for C-Vise
"""
from argparse import ArgumentParser, RawTextHelpFormatter
@@ -311,7 +312,7 @@ fi
return args, index + 1
def simplify_clang_args(self):
- """Simplify clang arguments before running C-Reduce to reduce the time the
+ """Simplify clang arguments before running C-Vise to reduce the time the
interestingness test takes to run.
"""
print("\nSimplifying the clang command...")
@@ -370,7 +371,7 @@ fi
verbose_print("Simplified command:", quote_cmd(self.get_crash_cmd()))
def reduce_clang_args(self):
- """Minimize the clang arguments after running C-Reduce, to get the smallest
+ """Minimize the clang arguments after running C-Vise, to get the smallest
command that reproduces the crash on the reduced file.
"""
print("\nReducing the clang crash command...")
@@ -413,14 +414,14 @@ fi
full_creduce_cmd = (
[creduce_cmd] + self.creduce_flags + [self.testfile, self.file_to_reduce]
)
- print("\nRunning C-Reduce...")
+ print("\nRunning C reduction tool...")
verbose_print(quote_cmd(full_creduce_cmd))
try:
p = subprocess.Popen(full_creduce_cmd)
p.communicate()
except KeyboardInterrupt:
# Hack to kill C-Reduce because it jumps into its own pgid
- print("\n\nctrl-c detected, killed creduce")
+ print("\n\nctrl-c detected, killed reduction tool")
p.kill()
@@ -453,14 +454,15 @@ def main():
"--creduce",
dest="creduce",
type=str,
- help="The path to the `creduce` executable. "
- "Required if `creduce` is not in PATH environment.",
+ help="The path to the `creduce` or `cvise` executable. "
+        "Required if neither `creduce` nor `cvise` is on PATH.",
)
parser.add_argument("-v", "--verbose", action="store_true")
args, creduce_flags = parser.parse_known_args()
verbose = args.verbose
llvm_bin = os.path.abspath(args.llvm_bin) if args.llvm_bin else None
creduce_cmd = check_cmd("creduce", None, args.creduce)
+ creduce_cmd = check_cmd("cvise", None, args.creduce)
clang_cmd = check_cmd("clang", llvm_bin, args.clang)
crash_script = check_file(args.crash_script[0])
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
index 76e612a..9f890a0 100644
--- a/flang-rt/CMakeLists.txt
+++ b/flang-rt/CMakeLists.txt
@@ -96,7 +96,7 @@ if (LLVM_TREE_AVAILABLE)
# being added to the build. Flang uses the same resource dir as clang.
include(GetClangResourceDir)
get_clang_resource_dir(FLANG_RT_OUTPUT_RESOURCE_DIR PREFIX "${LLVM_LIBRARY_OUTPUT_INTDIR}/..")
- get_clang_resource_dir(FLANG_RT_INSTALL_RESOURCE_PATH)
+ get_clang_resource_dir(FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT)
extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "${toolchain_lib_subdir}")
else ()
@@ -108,10 +108,12 @@ else ()
# be installed there using the same prefix. This is to not have a difference
# between bootstrap and standalone runtimes builds.
set(FLANG_RT_OUTPUT_RESOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
- set(FLANG_RT_INSTALL_RESOURCE_PATH "lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
+ set(FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT "lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "lib${LLVM_LIBDIR_SUFFIX}")
endif ()
+set(FLANG_RT_INSTALL_RESOURCE_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT}"
+ CACHE PATH "Path to install runtime libraries to (default: clang resource dir)")
extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}")
cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR)
cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH)
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
index b0a6322..5f351fb 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
@@ -143,6 +143,10 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) {
const CharBlock &source{std::get<0>(c.t).source};
return normalize_construct_name(source.ToString());
},
+ [&](const OpenMPAssumeConstruct &c) -> std::string {
+ const CharBlock &source{std::get<0>(c.t).source};
+ return normalize_construct_name(source.ToString());
+ },
[&](const OpenMPAllocatorsConstruct &c) -> std::string {
const CharBlock &source{std::get<0>(c.t).source};
return normalize_construct_name(source.ToString());
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 75c1130..30904a68 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -510,6 +510,7 @@ public:
NODE_ENUM(OmpMapTypeModifier, Value)
NODE(parser, OmpIteratorSpecifier)
NODE(parser, OmpIterator)
+ NODE(parser, OmpAbsentClause)
NODE(parser, OmpAffinityClause)
NODE(OmpAffinityClause, Modifier)
NODE(parser, OmpAlignment)
@@ -543,6 +544,7 @@ public:
#define GEN_FLANG_DUMP_PARSE_TREE_CLAUSES
#include "llvm/Frontend/OpenMP/OMP.inc"
NODE(parser, OmpClauseList)
+ NODE(parser, OmpContainsClause)
NODE(parser, OmpCriticalDirective)
NODE(parser, OmpErrorDirective)
NODE(parser, OmpNothingDirective)
@@ -585,6 +587,7 @@ public:
NODE(parser, OmpExpectation)
NODE_ENUM(OmpExpectation, Value)
NODE(parser, OmpDirectiveNameModifier)
+ NODE(parser, OmpHoldsClause)
NODE(parser, OmpIfClause)
NODE(OmpIfClause, Modifier)
NODE(parser, OmpLastprivateClause)
@@ -608,6 +611,9 @@ public:
}
NODE(parser, OmpObject)
NODE(parser, OmpObjectList)
+ NODE(parser, OmpNoOpenMPClause)
+ NODE(parser, OmpNoOpenMPRoutinesClause)
+ NODE(parser, OmpNoParallelismClause)
NODE(parser, OmpOrderClause)
NODE(OmpOrderClause, Modifier)
NODE_ENUM(OmpOrderClause, Ordering)
@@ -672,6 +678,10 @@ public:
NODE(parser, OpenACCStandaloneDeclarativeConstruct)
NODE(parser, OpenACCStandaloneConstruct)
NODE(parser, OpenACCWaitConstruct)
+ NODE(parser, OpenMPAssumeConstruct)
+ NODE(parser, OpenMPDeclarativeAssumes)
+ NODE(parser, OmpAssumeDirective)
+ NODE(parser, OmpEndAssumeDirective)
NODE(parser, OpenMPAtomicConstruct)
NODE(parser, OpenMPBlockConstruct)
NODE(parser, OpenMPCancelConstruct)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index dafe46f..d3b3d69 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3953,6 +3953,16 @@ using OmpContextSelector = traits::OmpContextSelectorSpecification;
// --- Clauses
+using OmpDirectiveList = std::list<llvm::omp::Directive>;
+
+// Ref: [5.2:214]
+//
+// absent-clause ->
+//   ABSENT(directive-name[, directive-name]...)
+struct OmpAbsentClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpAbsentClause, OmpDirectiveList);
+};
+
// Ref: [5.0:135-140], [5.1:161-166], [5.2:264-265]
//
// affinity-clause ->
@@ -4026,6 +4036,14 @@ struct OmpBindClause {
WRAPPER_CLASS_BOILERPLATE(OmpBindClause, Binding);
};
+// Ref: [5.2:214]
+//
+// contains-clause ->
+//   CONTAINS(directive-name[, directive-name]...)
+struct OmpContainsClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpContainsClause, OmpDirectiveList);
+};
+
// Ref: [4.5:46-50], [5.0:74-78], [5.1:92-96], [5.2:109]
//
// When used as a data-sharing clause:
@@ -4198,6 +4216,14 @@ struct OmpGrainsizeClause {
std::tuple<MODIFIERS(), ScalarIntExpr> t;
};
+// Ref: [5.2: 214]
+//
+// holds-clause ->
+// HOLDS(expr)
+struct OmpHoldsClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpHoldsClause, common::Indirection<Expr>);
+};
+
// Ref: [5.2:72-73], in 4.5-5.1 it's scattered over individual directives
// that allow the IF clause.
//
@@ -4279,6 +4305,21 @@ struct OmpMessageClause {
WRAPPER_CLASS_BOILERPLATE(OmpMessageClause, Expr);
};
+// Ref: [5.2: 214]
+//
+// no_openmp_clause -> NO_OPENMP
+EMPTY_CLASS(OmpNoOpenMPClause);
+
+// Ref: [5.2: 214]
+//
+// no_openmp_routines_clause -> NO_OPENMP_ROUTINES
+EMPTY_CLASS(OmpNoOpenMPRoutinesClause);
+
+// Ref: [5.2: 214]
+//
+// no_parallelism_clause -> NO_PARALLELISM
+EMPTY_CLASS(OmpNoParallelismClause);
+
// Ref: [4.5:87-91], [5.0:140-146], [5.1:166-171], [5.2:270]
//
// num-tasks-clause ->
@@ -4473,6 +4514,41 @@ struct OpenMPUtilityConstruct {
std::variant<OmpErrorDirective, OmpNothingDirective> u;
};
+// Ref: [5.2: 213-216]
+//
+// assumes-construct ->
+// ASSUMES absent-clause | contains-clause | holds-clause | no-openmp-clause |
+// no-openmp-routines-clause | no-parallelism-clause
+struct OpenMPDeclarativeAssumes {
+ TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAssumes);
+ std::tuple<Verbatim, OmpClauseList> t;
+ CharBlock source;
+};
+
+struct OmpAssumeDirective {
+ TUPLE_CLASS_BOILERPLATE(OmpAssumeDirective);
+ std::tuple<Verbatim, OmpClauseList> t;
+ CharBlock source;
+};
+
+struct OmpEndAssumeDirective {
+ WRAPPER_CLASS_BOILERPLATE(OmpEndAssumeDirective, Verbatim);
+ CharBlock source;
+};
+
+// Ref: [5.2: 213-216]
+//
+// assume-construct ->
+//   ASSUME absent-clause | contains-clause | holds-clause | no-openmp-clause |
+// no-openmp-routines-clause | no-parallelism-clause
+// block
+// [END ASSUME]
+struct OpenMPAssumeConstruct {
+ TUPLE_CLASS_BOILERPLATE(OpenMPAssumeConstruct);
+ std::tuple<OmpAssumeDirective, Block, std::optional<OmpEndAssumeDirective>> t;
+ CharBlock source;
+};
+
// 2.7.2 SECTIONS
// 2.11.2 PARALLEL SECTIONS
struct OmpSectionsDirective {
@@ -4595,10 +4671,11 @@ struct OpenMPDeclarativeAllocate {
struct OpenMPDeclarativeConstruct {
UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct);
CharBlock source;
- std::variant<OpenMPDeclarativeAllocate, OpenMPDeclareMapperConstruct,
- OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct,
+ std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes,
+ OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct,
+ OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct,
OpenMPThreadprivate, OpenMPRequiresConstruct, OpenMPUtilityConstruct,
- OpenMPDeclareTargetConstruct, OmpMetadirectiveDirective>
+ OmpMetadirectiveDirective>
u;
};
@@ -4885,7 +4962,7 @@ struct OpenMPConstruct {
OpenMPSectionConstruct, OpenMPLoopConstruct, OpenMPBlockConstruct,
OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct,
OpenMPUtilityConstruct, OpenMPExecutableAllocate,
- OpenMPAllocatorsConstruct, OpenMPCriticalConstruct>
+ OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, OpenMPCriticalConstruct>
u;
};
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index e0d23fc..97b4778 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -367,6 +367,9 @@ extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) {
[](const parser::OpenMPAllocatorsConstruct &c) {
return llvm::omp::OMPD_allocators;
},
+ [](const parser::OpenMPAssumeConstruct &c) {
+ return llvm::omp::OMPD_assume;
+ },
[](const parser::OpenMPAtomicConstruct &c) {
return llvm::omp::OMPD_atomic;
},
@@ -562,8 +565,11 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
[[fallthrough]];
case OMPD_distribute_parallel_do:
case OMPD_distribute_parallel_do_simd:
- cp.processCollapse(loc, eval, hostInfo.ops, hostInfo.iv);
cp.processNumThreads(stmtCtx, hostInfo.ops);
+ [[fallthrough]];
+ case OMPD_distribute:
+ case OMPD_distribute_simd:
+ cp.processCollapse(loc, eval, hostInfo.ops, hostInfo.iv);
break;
// Cases where 'teams' clauses might be present, and target SPMD is
@@ -573,10 +579,8 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
[[fallthrough]];
case OMPD_target_teams:
cp.processNumTeams(stmtCtx, hostInfo.ops);
- processSingleNestedIf([](Directive nestedDir) {
- return nestedDir == OMPD_distribute_parallel_do ||
- nestedDir == OMPD_distribute_parallel_do_simd;
- });
+ processSingleNestedIf(
+ [](Directive nestedDir) { return topDistributeSet.test(nestedDir); });
break;
// Cases where only 'teams' host-evaluated clauses might be present.
@@ -586,6 +590,7 @@ static void processHostEvalClauses(lower::AbstractConverter &converter,
[[fallthrough]];
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
+ cp.processCollapse(loc, eval, hostInfo.ops, hostInfo.iv);
cp.processNumTeams(stmtCtx, hostInfo.ops);
break;
@@ -3101,6 +3106,13 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate");
}
+static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+ semantics::SemanticsContext &semaCtx,
+ lower::pft::Evaluation &eval,
+ const parser::OpenMPDeclarativeAssumes &assumesConstruct) {
+ TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
+}
+
static void genOMP(
lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
@@ -3448,6 +3460,14 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
+ const parser::OpenMPAssumeConstruct &assumeConstruct) {
+ mlir::Location clauseLocation = converter.genLocation(assumeConstruct.source);
+ TODO(clauseLocation, "OpenMP ASSUME construct");
+}
+
+static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+ semantics::SemanticsContext &semaCtx,
+ lower::pft::Evaluation &eval,
const parser::OpenMPCriticalConstruct &criticalConstruct) {
const auto &cd = std::get<parser::OmpCriticalDirective>(criticalConstruct.t);
List<Clause> clauses =
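
The processHostEvalClauses change above extends host evaluation to bare `distribute` and `distribute simd` nests: their collapse bounds are now computed on the host and passed into the target region as host_eval operands (the new distribute/distribute_simd cases in host-eval.f90 below check exactly this). A hedged C++ analogue of the newly covered shape:

    void offload(int *a, int n) {
      // For the nest below, the loop's lb/ub/step can be evaluated on
      // the host and handed to the device kernel instead of being
      // recomputed inside it.
      #pragma omp target teams
      #pragma omp distribute
      for (int i = 0; i < n; ++i)
        a[i] = i;
    }
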
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index b39b873..014b4f8 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -773,7 +773,14 @@ TYPE_PARSER(construct<OmpSeverityClause>(
TYPE_PARSER(construct<OmpMessageClause>(expr))
-TYPE_PARSER(
+TYPE_PARSER(construct<OmpHoldsClause>(indirect(expr)))
+TYPE_PARSER(construct<OmpAbsentClause>(many(maybe(","_tok) >>
+ construct<llvm::omp::Directive>(OmpDirectiveNameParser{}))))
+TYPE_PARSER(construct<OmpContainsClause>(many(maybe(","_tok) >>
+ construct<llvm::omp::Directive>(OmpDirectiveNameParser{}))))
+
+TYPE_PARSER("ABSENT" >> construct<OmpClause>(construct<OmpClause::Absent>(
+ parenthesized(Parser<OmpAbsentClause>{}))) ||
"ACQUIRE" >> construct<OmpClause>(construct<OmpClause::Acquire>()) ||
"ACQ_REL" >> construct<OmpClause>(construct<OmpClause::AcqRel>()) ||
"AFFINITY" >> construct<OmpClause>(construct<OmpClause::Affinity>(
@@ -795,6 +802,8 @@ TYPE_PARSER(
parenthesized(Parser<OmpBindClause>{}))) ||
"COLLAPSE" >> construct<OmpClause>(construct<OmpClause::Collapse>(
parenthesized(scalarIntConstantExpr))) ||
+ "CONTAINS" >> construct<OmpClause>(construct<OmpClause::Contains>(
+ parenthesized(Parser<OmpContainsClause>{}))) ||
"COPYIN" >> construct<OmpClause>(construct<OmpClause::Copyin>(
parenthesized(Parser<OmpObjectList>{}))) ||
"COPYPRIVATE" >> construct<OmpClause>(construct<OmpClause::Copyprivate>(
@@ -839,6 +848,8 @@ TYPE_PARSER(
parenthesized(Parser<OmpObjectList>{}))) ||
"HINT" >> construct<OmpClause>(
construct<OmpClause::Hint>(parenthesized(constantExpr))) ||
+ "HOLDS" >> construct<OmpClause>(construct<OmpClause::Holds>(
+ parenthesized(Parser<OmpHoldsClause>{}))) ||
"IF" >> construct<OmpClause>(construct<OmpClause::If>(
parenthesized(Parser<OmpIfClause>{}))) ||
"INBRANCH" >> construct<OmpClause>(construct<OmpClause::Inbranch>()) ||
@@ -869,6 +880,11 @@ TYPE_PARSER(
"NOVARIANTS" >> construct<OmpClause>(construct<OmpClause::Novariants>(
parenthesized(scalarLogicalExpr))) ||
"NOWAIT" >> construct<OmpClause>(construct<OmpClause::Nowait>()) ||
+ "NO_OPENMP"_id >> construct<OmpClause>(construct<OmpClause::NoOpenmp>()) ||
+ "NO_OPENMP_ROUTINES" >>
+ construct<OmpClause>(construct<OmpClause::NoOpenmpRoutines>()) ||
+ "NO_PARALLELISM" >>
+ construct<OmpClause>(construct<OmpClause::NoParallelism>()) ||
"NUM_TASKS" >> construct<OmpClause>(construct<OmpClause::NumTasks>(
parenthesized(Parser<OmpNumTasksClause>{}))) ||
"NUM_TEAMS" >> construct<OmpClause>(construct<OmpClause::NumTeams>(
@@ -1299,28 +1315,45 @@ TYPE_PARSER(
parenthesized(Parser<OmpObjectList>{}), Parser<OmpClauseList>{})) /
lookAhead(endOmpLine / !statement(allocateStmt)))
+// Assumes Construct
+TYPE_PARSER(sourced(construct<OpenMPDeclarativeAssumes>(
+ verbatim("ASSUMES"_tok), Parser<OmpClauseList>{})))
+
// Declarative constructs
-TYPE_PARSER(startOmpLine >>
- withMessage("expected OpenMP construct"_err_en_US,
- sourced(construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPDeclareReductionConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPDeclareMapperConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPDeclareSimdConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPDeclareTargetConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPDeclarativeAllocate>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPRequiresConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPThreadprivate>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OpenMPUtilityConstruct>{}) ||
- construct<OpenMPDeclarativeConstruct>(
- Parser<OmpMetadirectiveDirective>{})) /
- endOmpLine))
+TYPE_PARSER(
+ startOmpLine >> withMessage("expected OpenMP construct"_err_en_US,
+ sourced(construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclarativeAssumes>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclareReductionConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclareMapperConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclareSimdConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclareTargetConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPDeclarativeAllocate>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPRequiresConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPThreadprivate>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OpenMPUtilityConstruct>{}) ||
+ construct<OpenMPDeclarativeConstruct>(
+ Parser<OmpMetadirectiveDirective>{})) /
+ endOmpLine))
+
+// Assume Construct
+TYPE_PARSER(sourced(construct<OmpAssumeDirective>(
+ verbatim("ASSUME"_tok), Parser<OmpClauseList>{})))
+
+TYPE_PARSER(sourced(construct<OmpEndAssumeDirective>(
+ verbatim(startOmpLine >> "END ASSUME"_tok))))
+
+TYPE_PARSER(sourced(
+ construct<OpenMPAssumeConstruct>(Parser<OmpAssumeDirective>{} / endOmpLine,
+ block, maybe(Parser<OmpEndAssumeDirective>{} / endOmpLine))))
// Block Construct
TYPE_PARSER(construct<OpenMPBlockConstruct>(
@@ -1369,6 +1402,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US,
construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
+ construct<OpenMPConstruct>(Parser<OpenMPAssumeConstruct>{}),
construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{}))))
// END OMP Block directives
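
In the clause parsers above, many(maybe(","_tok) >> item) accepts item[, item]... with optional comma separators (including an empty list). A standalone sketch of that combinator's behavior, using deliberately simplified stand-ins rather than flang's actual parser types:

    #include <cctype>
    #include <string_view>
    #include <vector>

    // many(maybe(","_tok) >> name): repeatedly skip an optional comma,
    // then try to parse one name; stop at the first failure.
    static std::vector<std::string_view> directiveList(std::string_view &in) {
      std::vector<std::string_view> out;
      for (;;) {
        if (!in.empty() && in.front() == ',')   // maybe(","_tok)
          in.remove_prefix(1);
        size_t n = 0;
        while (n < in.size() && std::isalpha((unsigned char)in[n]))
          ++n;
        if (n == 0)                             // name parser failed: done
          break;
        out.push_back(in.substr(0, n));         // stand-in for a directive
        in.remove_prefix(n);
      }
      return out;
    }
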
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 6260a01..960337b 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2179,6 +2179,8 @@ public:
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
Walk(std::get<ScalarIntExpr>(x.t));
}
+ void Unparse(const OmpAbsentClause &x) { Walk("", x.v, ","); }
+ void Unparse(const OmpContainsClause &x) { Walk("", x.v, ","); }
void Unparse(const OmpAffinityClause &x) {
using Modifier = OmpAffinityClause::Modifier;
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
@@ -2662,6 +2664,18 @@ public:
Walk(*end);
}
}
+ void Unparse(const OmpAssumeDirective &x) {
+ BeginOpenMP();
+ Word("!$OMP ASSUME");
+ Walk(" ", std::get<OmpClauseList>(x.t).v);
+ Put("\n");
+ EndOpenMP();
+ }
+ void Unparse(const OmpEndAssumeDirective &x) {
+ BeginOpenMP();
+ Word("!$OMP END ASSUME\n");
+ EndOpenMP();
+ }
void Unparse(const OmpCriticalDirective &x) {
BeginOpenMP();
Word("!$OMP CRITICAL");
@@ -2700,7 +2714,13 @@ public:
Put("\n");
EndOpenMP();
}
-
+ void Unparse(const OpenMPDeclarativeAssumes &x) {
+ BeginOpenMP();
+    Word("!$OMP ASSUMES");
+ Walk(std::get<OmpClauseList>(x.t));
+ Put("\n");
+ EndOpenMP();
+ }
void Unparse(const OpenMPDeclareMapperConstruct &z) {
BeginOpenMP();
Word("!$OMP DECLARE MAPPER (");
@@ -2868,7 +2888,9 @@ public:
Put("\n");
EndOpenMP();
}
- void Unparse(const OmpClauseList &x) { Walk(" ", x.v, " "); }
+ void Unparse(const OmpClauseList &x, const char *sep = " ") {
+ Walk(" ", x.v, sep);
+ }
void Unparse(const OpenMPSimpleStandaloneConstruct &x) {
BeginOpenMP();
Word("!$OMP ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index ef7204d..c95cf0d 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1251,6 +1251,22 @@ void OmpStructureChecker::CheckMasterNesting(
}
}
+void OmpStructureChecker::Enter(const parser::OpenMPAssumeConstruct &x) {
+ PushContextAndClauseSets(x.source, llvm::omp::Directive::OMPD_assume);
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPAssumeConstruct &) {
+ dirContext_.pop_back();
+}
+
+void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAssumes &x) {
+ PushContextAndClauseSets(x.source, llvm::omp::Directive::OMPD_assumes);
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAssumes &) {
+ dirContext_.pop_back();
+}
+
void OmpStructureChecker::Leave(const parser::OpenMPBlockConstruct &) {
if (GetDirectiveNest(TargetBlockOnlyTeams)) {
ExitDirectiveNest(TargetBlockOnlyTeams);
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index a9ac93a..6327861 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -81,6 +81,10 @@ public:
void Enter(const parser::OmpEndLoopDirective &);
void Leave(const parser::OmpEndLoopDirective &);
+ void Enter(const parser::OpenMPAssumeConstruct &);
+ void Leave(const parser::OpenMPAssumeConstruct &);
+ void Enter(const parser::OpenMPDeclarativeAssumes &);
+ void Leave(const parser::OpenMPDeclarativeAssumes &);
void Enter(const parser::OpenMPBlockConstruct &);
void Leave(const parser::OpenMPBlockConstruct &);
void Leave(const parser::OmpBeginBlockDirective &);
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index af8ea66..aa5eacf 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -670,4 +670,477 @@ implicit none
end function
end interface
+ ! LDCG
+ interface __ldcg
+ attributes(device) pure integer(4) function __ldcg_i4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(4), intent(in) :: x
+ end function
+ attributes(device) pure integer(8) function __ldcg_i8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldcg_cd(x) bind(c) result(y)
+ import c_devptr
+ !dir$ ignore_tkr (d) x
+ type(c_devptr), intent(in) :: x
+ type(c_devptr) :: y
+ end function
+ attributes(device) pure real(2) function __ldcg_r2(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(2), intent(in) :: x
+ end function
+ attributes(device) pure real(4) function __ldcg_r4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(4), intent(in) :: x
+ end function
+ attributes(device) pure real(8) function __ldcg_r8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(8), intent(in) :: x
+ end function
+ attributes(device) pure complex(4) function __ldcg_c4(x) &
+ bind(c,name='__ldcg_c4x')
+ !dir$ ignore_tkr (d) x
+ complex(4), intent(in) :: x
+ end function
+ attributes(device) pure complex(8) function __ldcg_c8(x) &
+ bind(c,name='__ldcg_c8x')
+ !dir$ ignore_tkr (d) x
+ complex(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldcg_i4x4(x) result(y)
+ !dir$ ignore_tkr (d) x
+ integer(4), dimension(4), intent(in) :: x
+ integer(4), dimension(4) :: y
+ end function
+ attributes(device) pure function __ldcg_i8x2(x) result(y)
+ !dir$ ignore_tkr (d) x
+ integer(8), dimension(2), intent(in) :: x
+ integer(8), dimension(2) :: y
+ end function
+ attributes(device) pure function __ldcg_r2x2(x) result(y)
+ !dir$ ignore_tkr (d) x
+ real(2), dimension(2), intent(in) :: x
+ real(2), dimension(2) :: y
+ end function
+ attributes(device) pure function __ldcg_r4x4(x) result(y)
+ !dir$ ignore_tkr (d) x
+ real(4), dimension(4), intent(in) :: x
+ real(4), dimension(4) :: y
+ end function
+ attributes(device) pure function __ldcg_r8x2(x) result(y)
+ !dir$ ignore_tkr (d) x
+ real(8), dimension(2), intent(in) :: x
+ real(8), dimension(2) :: y
+ end function
+ end interface
+
+ ! LDCA
+ interface __ldca
+ attributes(device) pure integer(4) function __ldca_i4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(4), intent(in) :: x
+ end function
+ attributes(device) pure integer(8) function __ldca_i8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldca_cd(x) bind(c) result(y)
+ !dir$ ignore_tkr (d) x
+ import c_devptr
+ type(c_devptr), intent(in) :: x
+ type(c_devptr) :: y
+ end function
+ attributes(device) pure real(2) function __ldca_r2(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(2), intent(in) :: x
+ end function
+ attributes(device) pure real(4) function __ldca_r4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(4), intent(in) :: x
+ end function
+ attributes(device) pure real(8) function __ldca_r8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(8), intent(in) :: x
+ end function
+ attributes(device) pure complex(4) function __ldca_c4(x) &
+ bind(c,name='__ldca_c4x')
+ !dir$ ignore_tkr (d) x
+ complex(4), intent(in) :: x
+ end function
+ attributes(device) pure complex(8) function __ldca_c8(x) &
+ bind(c,name='__ldca_c8x')
+ !dir$ ignore_tkr (d) x
+ complex(8), intent(in) :: x
+ end function
+ end interface
+
+ ! LDCS
+ interface __ldcs
+ attributes(device) pure integer(4) function __ldcs_i4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(4), intent(in) :: x
+ end function
+ attributes(device) pure integer(8) function __ldcs_i8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldcs_cd(x) bind(c) result(y)
+ import c_devptr
+ !dir$ ignore_tkr (d) x
+ type(c_devptr), intent(in) :: x
+ type(c_devptr) :: y
+ end function
+ attributes(device) pure real(2) function __ldcs_r2(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(2), intent(in) :: x
+ end function
+ attributes(device) pure real(4) function __ldcs_r4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(4), intent(in) :: x
+ end function
+ attributes(device) pure real(8) function __ldcs_r8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(8), intent(in) :: x
+ end function
+ attributes(device) pure complex(4) function __ldcs_c4(x) &
+ bind(c,name='__ldcs_c4x')
+ !dir$ ignore_tkr (d) x
+ complex(4), intent(in) :: x
+ end function
+ attributes(device) pure complex(8) function __ldcs_c8(x) &
+ bind(c,name='__ldcs_c8x')
+ !dir$ ignore_tkr (d) x
+ complex(8), intent(in) :: x
+ end function
+ end interface
+
+ ! LDLU
+ interface __ldlu
+ attributes(device) pure integer(4) function __ldlu_i4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(4), intent(in) :: x
+ end function
+ attributes(device) pure integer(8) function __ldlu_i8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldlu_cd(x) bind(c) result(y)
+ import c_devptr
+ !dir$ ignore_tkr (d) x
+ type(c_devptr), intent(in) :: x
+ type(c_devptr) :: y
+ end function
+ attributes(device) pure real(2) function __ldlu_r2(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(2), intent(in) :: x
+ end function
+ attributes(device) pure real(4) function __ldlu_r4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(4), intent(in) :: x
+ end function
+ attributes(device) pure real(8) function __ldlu_r8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(8), intent(in) :: x
+ end function
+ attributes(device) pure complex(4) function __ldlu_c4(x) &
+ bind(c,name='__ldlu_c4x')
+ !dir$ ignore_tkr (d) x
+ complex(4), intent(in) :: x
+ end function
+ attributes(device) pure complex(8) function __ldlu_c8(x) &
+ bind(c,name='__ldlu_c8x')
+ !dir$ ignore_tkr (d) x
+ complex(8), intent(in) :: x
+ end function
+ end interface
+
+ ! LDCV
+ interface __ldcv
+ attributes(device) pure integer(4) function __ldcv_i4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(4), intent(in) :: x
+ end function
+ attributes(device) pure integer(8) function __ldcv_i8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ integer(8), intent(in) :: x
+ end function
+ attributes(device) pure function __ldcv_cd(x) bind(c) result(y)
+ import c_devptr
+ !dir$ ignore_tkr (d) x
+ type(c_devptr), intent(in) :: x
+ type(c_devptr) :: y
+ end function
+ attributes(device) pure real(2) function __ldcv_r2(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(2), intent(in) :: x
+ end function
+ attributes(device) pure real(4) function __ldcv_r4(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(4), intent(in) :: x
+ end function
+ attributes(device) pure real(8) function __ldcv_r8(x) bind(c)
+ !dir$ ignore_tkr (d) x
+ real(8), intent(in) :: x
+ end function
+ attributes(device) pure complex(4) function __ldcv_c4(x) &
+ bind(c,name='__ldcv_c4x')
+ !dir$ ignore_tkr (d) x
+ complex(4), intent(in) :: x
+ end function
+ attributes(device) pure complex(8) function __ldcv_c8(x) &
+ bind(c,name='__ldcv_c8x')
+ !dir$ ignore_tkr (d) x
+ complex(8), intent(in) :: x
+ end function
+ end interface
+
+ ! STWB
+ interface __stwb
+ attributes(device) pure subroutine __stwb_i4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), value :: x
+ integer(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwb_i8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), value :: x
+ integer(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwb_cd(y, x) bind(c)
+ import c_devptr
+ !dir$ ignore_tkr (d) y, (d) x
+ type(c_devptr), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_r2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), value :: x
+ real(2), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwb_r4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), value :: x
+ real(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwb_r8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), value :: x
+ real(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwb_c4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (rd) x
+ complex(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_c8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ complex(8), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_i4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_i8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_r2x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_r4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwb_r8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ end interface
+
+ ! STCG
+ interface __stcg
+ attributes(device) pure subroutine __stcg_i4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), value :: x
+ integer(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcg_i8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), value :: x
+ integer(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcg_cd(y, x) bind(c)
+ import c_devptr
+ !dir$ ignore_tkr (d) y, (d) x
+ type(c_devptr), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_r2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), value :: x
+ real(2), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcg_r4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), value :: x
+ real(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcg_r8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), value :: x
+ real(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcg_c4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (rd) x
+ complex(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_c8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ complex(8), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_i4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_i8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_r2x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_r4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcg_r8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ end interface
+
+ ! STCS
+ interface __stcs
+ attributes(device) pure subroutine __stcs_i4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), value :: x
+ integer(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcs_i8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), value :: x
+ integer(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcs_cd(y, x) bind(c)
+ import c_devptr
+ !dir$ ignore_tkr (d) y, (d) x
+ type(c_devptr), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_r2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), value :: x
+ real(2), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcs_r4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), value :: x
+ real(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcs_r8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), value :: x
+ real(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stcs_c4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (rd) x
+ complex(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_c8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ complex(8), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_i4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_i8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_r2x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_r4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stcs_r8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ end interface
+
+ ! STWT
+ interface __stwt
+ attributes(device) pure subroutine __stwt_i4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), value :: x
+ integer(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwt_i8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), value :: x
+ integer(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwt_cd(y, x) bind(c)
+ import c_devptr
+ !dir$ ignore_tkr (d) y, (d) x
+ type(c_devptr), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_r2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), value :: x
+ real(2), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwt_r4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), value :: x
+ real(4), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwt_r8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), value :: x
+ real(8), device, intent(in) :: y
+ end subroutine
+ attributes(device) pure subroutine __stwt_c4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (rd) x
+ complex(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_c8(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ complex(8), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_i4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_i8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ integer(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_r2x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(2), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_r4x4(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(4), dimension(4), device, intent(in) :: y, x
+ end subroutine
+ attributes(device) pure subroutine __stwt_r8x2(y, x) bind(c)
+ !dir$ ignore_tkr (d) y, (d) x
+ real(8), dimension(2), device, intent(in) :: y, x
+ end subroutine
+ end interface
+
end module
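
The new interfaces expose the CUDA cache-hint loads and stores (PTX ld.global.{cg,ca,cs,lu,cv} and st.global.{wb,cg,cs,wt}) to CUDA Fortran, mirroring the CUDA C++ intrinsics of the same names. A hedged CUDA C++ sketch of the equivalent usage:

    // __ldcg loads with the .cg hint (cache in L2, bypass L1); __stcg
    // stores with the matching hint -- useful for streaming data that
    // will not be re-read by the same thread block.
    __global__ void scale(const float *in, float *out, float k, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        __stcg(&out[i], __ldcg(&in[i]) * k);
    }
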
diff --git a/flang/test/Lower/OpenMP/Todo/assume.f90 b/flang/test/Lower/OpenMP/Todo/assume.f90
new file mode 100644
index 0000000..1216888
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/assume.f90
@@ -0,0 +1,10 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: OpenMP ASSUME construct
+program p
+ integer r
+ r = 1
+!$omp assume no_parallelism
+ print *,r
+!$omp end assume
+end program p
diff --git a/flang/test/Lower/OpenMP/Todo/assumes.f90 b/flang/test/Lower/OpenMP/Todo/assumes.f90
new file mode 100644
index 0000000..ac26ed1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/assumes.f90
@@ -0,0 +1,6 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: OpenMP ASSUMES declaration
+program p
+ !$omp assumes no_openmp
+end program p
diff --git a/flang/test/Lower/OpenMP/host-eval.f90 b/flang/test/Lower/OpenMP/host-eval.f90
index 32c5246..65258c9 100644
--- a/flang/test/Lower/OpenMP/host-eval.f90
+++ b/flang/test/Lower/OpenMP/host-eval.f90
@@ -155,3 +155,106 @@ subroutine distribute_parallel_do_simd()
!$omp end distribute parallel do simd
!$omp end teams
end subroutine distribute_parallel_do_simd
+
+! BOTH-LABEL: func.func @_QPdistribute
+subroutine distribute()
+ ! BOTH: omp.target
+
+ ! HOST-SAME: host_eval(%{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32)
+
+ ! DEVICE-NOT: host_eval({{.*}})
+ ! DEVICE-SAME: {
+
+ ! BOTH: omp.teams
+ !$omp target teams
+
+ ! BOTH: omp.distribute
+ ! BOTH-NEXT: omp.loop_nest
+
+ ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+ !$omp distribute
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute
+ !$omp end target teams
+
+ ! BOTH: omp.target
+ ! BOTH-NOT: host_eval({{.*}})
+ ! BOTH-SAME: {
+ ! BOTH: omp.teams
+ !$omp target teams
+ call foo() !< Prevents this from being Generic-SPMD.
+
+ ! BOTH: omp.distribute
+ !$omp distribute
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute
+ !$omp end target teams
+
+ ! BOTH: omp.teams
+ !$omp teams
+
+ ! BOTH: omp.distribute
+ !$omp distribute
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute
+ !$omp end teams
+end subroutine distribute
+
+! BOTH-LABEL: func.func @_QPdistribute_simd
+subroutine distribute_simd()
+ ! BOTH: omp.target
+
+ ! HOST-SAME: host_eval(%{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32)
+
+ ! DEVICE-NOT: host_eval({{.*}})
+ ! DEVICE-SAME: {
+
+ ! BOTH: omp.teams
+ !$omp target teams
+
+ ! BOTH: omp.distribute
+ ! BOTH-NEXT: omp.simd
+ ! BOTH-NEXT: omp.loop_nest
+
+ ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+ !$omp distribute simd
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute simd
+ !$omp end target teams
+
+ ! BOTH: omp.target
+ ! BOTH-NOT: host_eval({{.*}})
+ ! BOTH-SAME: {
+ ! BOTH: omp.teams
+ !$omp target teams
+ call foo() !< Prevents this from being Generic-SPMD.
+
+ ! BOTH: omp.distribute
+ ! BOTH-NEXT: omp.simd
+ !$omp distribute simd
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute simd
+ !$omp end target teams
+
+ ! BOTH: omp.teams
+ !$omp teams
+
+ ! BOTH: omp.distribute
+ ! BOTH-NEXT: omp.simd
+ !$omp distribute simd
+ do i=1,10
+ call foo()
+ end do
+ !$omp end distribute simd
+ !$omp end teams
+end subroutine distribute_simd
diff --git a/flang/test/Parser/OpenMP/assumption.f90 b/flang/test/Parser/OpenMP/assumption.f90
new file mode 100644
index 0000000..f1cb0c8
--- /dev/null
+++ b/flang/test/Parser/OpenMP/assumption.f90
@@ -0,0 +1,59 @@
+! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -fopenmp-version=51 -fopenmp -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s --check-prefix="PARSE-TREE"
+subroutine sub1
+ integer :: r
+!CHECK: !$OMP ASSUME NO_OPENMP
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAssumeConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> NoOpenmp
+ !$omp assume no_openmp
+!CHECK: !$OMP ASSUME NO_PARALLELISM
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAssumeConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> NoParallelism
+ !$omp assume no_parallelism
+!CHECK: !$OMP ASSUME NO_OPENMP_ROUTINES
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAssumeConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> NoOpenmpRoutines
+ !$omp assume no_openmp_routines
+!CHECK: !$OMP ASSUME ABSENT(ALLOCATE), CONTAINS(WORKSHARE,TASK)
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAssumeConstruct
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> Absent -> OmpAbsentClause -> llvm::omp::Directive = allocate
+!PARSE-TREE: OmpClause -> Contains -> OmpContainsClause -> llvm::omp::Directive = workshare
+!PARSE-TREE: llvm::omp::Directive = task
+ !$omp assume absent(allocate), contains(workshare, task)
+!CHECK: !$OMP ASSUME HOLDS(1==1)
+ !$omp assume holds(1.eq.1)
+ print *, r
+end subroutine sub1
+
+subroutine sub2
+ integer :: r
+ integer :: v
+!CHECK: !$OMP ASSUME NO_OPENMP
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAssumeConstruct
+!PARSE-TREE: OmpAssumeDirective
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> NoOpenmp
+!PARSE-TREE: Block
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt
+!PARSE-TREE: Expr -> Add
+!PARSE-TREE: OmpEndAssumeDirective
+ v = 87
+ !$omp assume no_openmp
+ r = r + 1
+!CHECK: !$OMP END ASSUME
+ !$omp end assume
+end subroutine sub2
+
+program p
+!CHECK: !$OMP ASSUMES NO_OPENMP
+!PARSE-TREE: SpecificationPart
+!PARSE-TREE: OpenMPDeclarativeConstruct -> OpenMPDeclarativeAssumes
+!PARSE-TREE: Verbatim
+!PARSE-TREE: OmpClauseList -> OmpClause -> NoOpenmp
+ !$omp assumes no_openmp
+end program p
+
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index a4fbba9..daec56f 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -128,6 +128,10 @@ else()
endif()
endif()
+set(LIBC_TARGET_TRIPLE "" CACHE STRING "The target triple for the libc build.")
+# Defines LIBC_TARGET_ARCHITECTURE and associated macros.
+include(LLVMLibCArchitectures)
+
# Some targets can only support the full build.
set(default_to_full_build OFF)
if(LIBC_TARGET_OS_IS_GPU)
@@ -138,16 +142,11 @@ option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc"
option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON)
option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF)
-set(LIBC_TARGET_TRIPLE "" CACHE STRING "The target triple for the libc build.")
-
option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)
set(LIBC_ENABLE_UNITTESTS ON)
set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD})
-# Defines LIBC_TARGET_ARCHITECTURE and associated macros.
-include(LLVMLibCArchitectures)
-
set(LIBC_CONFIG_JSON_FILE_LIST "")
if(NOT LIBC_CONFIG_PATH)
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index ea8b76e..99f9024 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -98,13 +98,6 @@ function(add_gen_header target_name)
set(dep_file "${out_file}.d")
set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE})
- set(fq_data_files "")
- if(ADD_GEN_HDR_DATA_FILES)
- foreach(data_file IN LISTS ADD_GEN_HDR_DATA_FILES)
- list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}")
- endforeach(data_file)
- endif()
-
set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
list(TRANSFORM entry_points PREPEND "--entry-point=")
@@ -117,7 +110,7 @@ function(add_gen_header target_name)
--write-if-changed
${entry_points}
${yaml_file}
- DEPENDS ${yaml_file} ${fq_data_files}
+ DEPENDS ${yaml_file}
DEPFILE ${dep_file}
COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file}"
)
@@ -133,7 +126,7 @@ function(add_gen_header target_name)
${entry_points}
--output_dir ${decl_out_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- DEPENDS ${yaml_file} ${fq_data_files}
+ DEPENDS ${yaml_file}
)
endif()
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index aac4017..6c2be4d 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -212,8 +212,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdlib.realloc
# stdio.h entrypoints
- libc.src.stdio.fdopen
- libc.src.stdio.fileno
libc.src.stdio.fprintf
libc.src.stdio.fscanf
libc.src.stdio.vfscanf
@@ -918,6 +916,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.clearerr
libc.src.stdio.clearerr_unlocked
libc.src.stdio.fclose
+ libc.src.stdio.fdopen
libc.src.stdio.feof
libc.src.stdio.feof_unlocked
libc.src.stdio.ferror
@@ -926,6 +925,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.fgetc
libc.src.stdio.fgetc_unlocked
libc.src.stdio.fgets
+ libc.src.stdio.fileno
libc.src.stdio.flockfile
libc.src.stdio.fopen
libc.src.stdio.fopencookie
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 6b006f0..fa1e48f 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -211,8 +211,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdlib.realloc
# stdio.h entrypoints
- libc.src.stdio.fdopen
- libc.src.stdio.fileno
libc.src.stdio.fprintf
libc.src.stdio.fscanf
libc.src.stdio.vfscanf
@@ -857,6 +855,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.clearerr
libc.src.stdio.clearerr_unlocked
libc.src.stdio.fclose
+ libc.src.stdio.fdopen
libc.src.stdio.feof
libc.src.stdio.feof_unlocked
libc.src.stdio.ferror
@@ -865,6 +864,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.fgetc
libc.src.stdio.fgetc_unlocked
libc.src.stdio.fgets
+ libc.src.stdio.fileno
libc.src.stdio.flockfile
libc.src.stdio.fopen
libc.src.stdio.fopencookie
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index eaceb15..a07393a 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -177,6 +177,7 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdbit.stdc_trailing_zeros_us
# stdlib.h entrypoints
+ libc.src.stdlib.a64l
libc.src.stdlib.abs
libc.src.stdlib.atof
libc.src.stdlib.atoi
@@ -211,8 +212,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdlib.realloc
# stdio.h entrypoints
- libc.src.stdio.fdopen
- libc.src.stdio.fileno
libc.src.stdio.fprintf
libc.src.stdio.fscanf
libc.src.stdio.vfscanf
@@ -1018,6 +1017,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.clearerr
libc.src.stdio.clearerr_unlocked
libc.src.stdio.fclose
+ libc.src.stdio.fdopen
libc.src.stdio.feof
libc.src.stdio.feof_unlocked
libc.src.stdio.ferror
@@ -1026,6 +1026,7 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.stdio.fgetc
libc.src.stdio.fgetc_unlocked
libc.src.stdio.fgets
+ libc.src.stdio.fileno
libc.src.stdio.flockfile
libc.src.stdio.fopen
libc.src.stdio.fopencookie
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index 8d2b3f3..b308df9 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -24,6 +24,12 @@ functions:
return_type: _Noreturn void
arguments:
- type: int
+ - name: a64l
+ standards:
+ - posix
+ return_type: long
+ arguments:
+ - type: const char *
- name: abort
standards:
- stdc
diff --git a/libc/src/stdio/generic/fileno.cpp b/libc/src/stdio/generic/fileno.cpp
index 0bec180..8fa73d1 100644
--- a/libc/src/stdio/generic/fileno.cpp
+++ b/libc/src/stdio/generic/fileno.cpp
@@ -1,5 +1,4 @@
-//===-- Implementation of fileno
-//-------------------------------------------===//
+//===-- Implementation of fileno ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index 73a9fbf..361f230 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -185,6 +185,17 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ a64l
+ SRCS
+ a64l.cpp
+ HDRS
+ a64l.h
+ DEPENDS
+ libc.src.__support.ctype_utils
+ libc.hdr.types.size_t
+)
+
+add_entrypoint_object(
abs
SRCS
abs.cpp
diff --git a/libc/src/stdlib/a64l.cpp b/libc/src/stdlib/a64l.cpp
new file mode 100644
index 0000000..5c1b819
--- /dev/null
+++ b/libc/src/stdlib/a64l.cpp
@@ -0,0 +1,64 @@
+//===-- Implementation of a64l --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/a64l.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+// This mapping stays out of ctype_utils: the digit ordering of this base-64
+// alphabet is implementation specific and the character set itself is unusual.
+// Returns -1 for any character without a specified value.
+constexpr static int32_t b64_char_to_int(char ch) {
+ // from the standard: "The characters used to represent digits are '.' (dot)
+ // for 0, '/' for 1, '0' through '9' for [2,11], 'A' through 'Z' for [12,37],
+ // and 'a' through 'z' for [38,63]."
+ if (ch == '.')
+ return 0;
+ if (ch == '/')
+ return 1;
+
+ // handle the case of an unspecified char.
+ if (!internal::isalnum(ch))
+ return -1;
+
+ bool is_lower = internal::islower(ch);
+ // Add 2 to account for '.' and '/'; b36_char_to_int is case-insensitive,
+ // so add the lowercase offset back explicitly.
+ return internal::b36_char_to_int(ch) + 2 + (is_lower ? 26 : 0);
+}
+
+// Decodes a base-64 string into the low 32 bits of a long, per POSIX.
+LLVM_LIBC_FUNCTION(long, a64l, (const char *s)) {
+ // The standard says to use at most 6 characters.
+ constexpr size_t MAX_LENGTH = 6;
+ uint32_t result = 0;
+
+ for (size_t i = 0; i < MAX_LENGTH && s[i] != '\0'; ++i) {
+ int32_t cur_val = b64_char_to_int(s[i]);
+ // The standard leaves the behavior on an unspecified character undefined;
+ // here it is treated as the end of the string.
+ if (cur_val == -1)
+ break;
+
+ // The first digit is the least significant, so each subsequent digit is
+ // shifted 6 bits further (6 bits because 2^6 = 64). Accumulate in unsigned
+ // arithmetic so the sixth digit's shift into the high bits is well-defined.
+ result += static_cast<uint32_t>(cur_val) << (6 * i);
+ }
+
+ // The standard says to sign-extend from 32 bits.
+ return static_cast<long>(static_cast<int32_t>(result));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
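The digit order above is little-endian: the first character is the least significant base-64 digit. As a worked check of the mapping and of the FullString test value further below, here is a standalone sketch (hypothetical names, ordinary hosted C++, and an assumed ASCII execution character set instead of the internal libc helpers):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical re-implementation of the mapping, for illustration only.
    static int32_t b64_value(char ch) {
      if (ch == '.') return 0;
      if (ch == '/') return 1;
      if (ch >= '0' && ch <= '9') return ch - '0' + 2;
      if (ch >= 'A' && ch <= 'Z') return ch - 'A' + 12;
      if (ch >= 'a' && ch <= 'z') return ch - 'a' + 38;
      return -1; // unspecified character, treated as end of input
    }

    static long a64l_sketch(const char *s) {
      uint32_t result = 0;
      for (size_t i = 0; i < 6 && s[i] != '\0'; ++i) {
        int32_t v = b64_value(s[i]);
        if (v == -1)
          break;
        // Digit i contributes v * 64^i; unsigned accumulation stays defined.
        result += static_cast<uint32_t>(v) << (6 * i);
      }
      return static_cast<long>(static_cast<int32_t>(result));
    }

    int main() {
      // "AbC12/": A=12, b=39, C=14, '1'=3, '2'=4, '/'=1, so the value is
      // 12 + 39*64 + 14*64^2 + 3*64^3 + 4*64^4 + 1*64^5 = 1141696972.
      assert(a64l_sketch("AbC12/") == 1141696972L);
      return 0;
    }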
diff --git a/libc/src/stdlib/a64l.h b/libc/src/stdlib/a64l.h
new file mode 100644
index 0000000..024be05
--- /dev/null
+++ b/libc/src/stdlib/a64l.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for a64l --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_A64L_H
+#define LLVM_LIBC_SRC_STDLIB_A64L_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+long a64l(const char *s);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDLIB_A64L_H
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index e6c8a62..8481004 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -222,6 +222,16 @@ add_libc_test(
)
add_libc_test(
+ a64l_test
+ SUITE
+ libc-stdlib-tests
+ SRCS
+ a64l_test.cpp
+ DEPENDS
+ libc.src.stdlib.a64l
+)
+
+add_libc_test(
abs_test
SUITE
libc-stdlib-tests
diff --git a/libc/test/src/stdlib/a64l_test.cpp b/libc/test/src/stdlib/a64l_test.cpp
new file mode 100644
index 0000000..acdef5d
--- /dev/null
+++ b/libc/test/src/stdlib/a64l_test.cpp
@@ -0,0 +1,87 @@
+//===-- Unittests for a64l ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/a64l.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcA64lTest, EmptyString) { ASSERT_EQ(LIBC_NAMESPACE::a64l(""), 0l); }
+TEST(LlvmLibcA64lTest, FullString) {
+ ASSERT_EQ(LIBC_NAMESPACE::a64l("AbC12/"), 1141696972l);
+}
+
+constexpr char B64_CHARS[64] = {
+ '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A',
+ 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+ 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
+ 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+ 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+};
+
+TEST(LlvmLibcA64lTest, OneCharacter) {
+ char example_str[2] = {'\0', '\0'};
+
+ for (size_t i = 0; i < 64; ++i) {
+ example_str[0] = B64_CHARS[i];
+ ASSERT_EQ(LIBC_NAMESPACE::a64l(example_str), static_cast<long>(i));
+ }
+}
+
+TEST(LlvmLibcA64lTest, TwoCharacters) {
+ char example_str[3] = {'\0', '\0', '\0'};
+
+ for (size_t first = 0; first < 64; ++first) {
+ example_str[0] = B64_CHARS[first];
+ for (size_t second = 0; second < 64; ++second) {
+ example_str[1] = B64_CHARS[second];
+
+ ASSERT_EQ(LIBC_NAMESPACE::a64l(example_str),
+ static_cast<long>(first + (second * 64)));
+ }
+ }
+}
+
+TEST(LlvmLibcA64lTest, FiveSameCharacters) {
+ // A sixth digit would contribute only the top two bits of the 32-bit
+ // result; that case is not exercised here.
+ char example_str[6] = {
+ '\0', '\0', '\0', '\0', '\0', '\0',
+ };
+
+ // Set bits 0, 6, 12, 18 and 24, one per digit position.
+ const long BASE_NUM = 0b1000001000001000001000001;
+
+ for (size_t char_val = 0; char_val < 64; ++char_val) {
+ for (size_t i = 0; i < 5; ++i)
+ example_str[i] = B64_CHARS[char_val];
+
+ const long expected_result = BASE_NUM * char_val;
+
+ ASSERT_EQ(LIBC_NAMESPACE::a64l(example_str), expected_result);
+ }
+}
+
+TEST(LlvmLibcA64lTest, OneOfSixCharacters) {
+ char example_str[7] = {'\0', '\0', '\0', '\0', '\0', '\0', '\0'};
+
+ for (size_t cur_char = 0; cur_char < 6; ++cur_char) {
+ // Reset the string: set every char to the base-64 digit for 0.
+ for (size_t i = 0; i < 6; ++i)
+ example_str[i] = B64_CHARS[0];
+
+ for (size_t char_val = 0; char_val < 64; ++char_val) {
+ example_str[cur_char] = B64_CHARS[char_val];
+
+ // Need to limit to 32 bits, since that's what the standard says the
+ // function does.
+ const long expected_result =
+ static_cast<int32_t>(char_val << (6 * cur_char));
+
+ ASSERT_EQ(LIBC_NAMESPACE::a64l(example_str), expected_result);
+ }
+ }
+}
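One case of the final loop is worth spelling out: with char_val = 3 in position 5 (the string ".....1"), the shifted value sets the top two bits, and the 32-bit sign extension makes the returned long negative. A standalone arithmetic check, assuming the same int32_t cast the expected_result computation above uses:

    #include <cassert>
    #include <cstdint>

    int main() {
      // ".....1" decodes as five 0-digits followed by '1' (value 3) in
      // position 5, so the raw 32-bit pattern is 3 << 30 = 0xC0000000.
      const uint32_t raw = 3u << 30;
      // Reinterpreted as int32_t and sign-extended into long, the result
      // is negative.
      const long expected = static_cast<int32_t>(raw);
      assert(expected == -1073741824L);
      return 0;
    }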
diff --git a/libclc/amdgcn/lib/workitem/get_global_size.cl b/libclc/amdgcn/lib/workitem/get_global_size.cl
index 2f28ca6..0fec7e2 100644
--- a/libclc/amdgcn/lib/workitem/get_global_size.cl
+++ b/libclc/amdgcn/lib/workitem/get_global_size.cl
@@ -1,17 +1,13 @@
#include <clc/clc.h>
-uint __clc_amdgcn_get_global_size_x(void) __asm("llvm.r600.read.global.size.x");
-uint __clc_amdgcn_get_global_size_y(void) __asm("llvm.r600.read.global.size.y");
-uint __clc_amdgcn_get_global_size_z(void) __asm("llvm.r600.read.global.size.z");
-
_CLC_DEF _CLC_OVERLOAD size_t get_global_size(uint dim) {
switch (dim) {
case 0:
- return __clc_amdgcn_get_global_size_x();
+ return __builtin_amdgcn_grid_size_x();
case 1:
- return __clc_amdgcn_get_global_size_y();
+ return __builtin_amdgcn_grid_size_y();
case 2:
- return __clc_amdgcn_get_global_size_z();
+ return __builtin_amdgcn_grid_size_z();
default:
return 1;
}
diff --git a/libclc/clc/include/clc/float/definitions.h b/libclc/clc/include/clc/float/definitions.h
index 6fea19a..618d02a 100644
--- a/libclc/clc/include/clc/float/definitions.h
+++ b/libclc/clc/include/clc/float/definitions.h
@@ -69,8 +69,6 @@
#ifdef cl_khr_fp16
-#if __OPENCL_VERSION__ >= 120
-
#define HALF_DIG 3
#define HALF_MANT_DIG 11
#define HALF_MAX_10_EXP +4
@@ -83,6 +81,6 @@
#define HALF_MIN 0x1.0p-14h
#define HALF_EPSILON 0x1.0p-10h
-#endif
+#define M_LOG2E_H 0x1.714p+0h
#endif
diff --git a/libclc/clc/include/clc/math/clc_log.h b/libclc/clc/include/clc/math/clc_log.h
new file mode 100644
index 0000000..c4ca135
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_LOG_H__
+#define __CLC_MATH_CLC_LOG_H__
+
+#define __CLC_FUNCTION __clc_log
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG_H__
diff --git a/libclc/clc/include/clc/math/clc_log10.h b/libclc/clc/include/clc/math/clc_log10.h
new file mode 100644
index 0000000..0592692
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log10.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_LOG10_H__
+#define __CLC_MATH_CLC_LOG10_H__
+
+#define __CLC_FUNCTION __clc_log10
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG10_H__
diff --git a/libclc/clc/include/clc/math/clc_log2.h b/libclc/clc/include/clc/math/clc_log2.h
new file mode 100644
index 0000000..8b750bd
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log2.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_LOG2_H__
+#define __CLC_MATH_CLC_LOG2_H__
+
+#define __CLC_FUNCTION __clc_log2
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG2_H__
diff --git a/libclc/clc/include/clc/math/clc_round.h b/libclc/clc/include/clc/math/clc_round.h
new file mode 100644
index 0000000..1402625
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_round.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_MATH_CLC_ROUND_H__
+#define __CLC_MATH_CLC_ROUND_H__
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION __clc_round
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_ROUND_H__
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index 54b0cd2..dbf3c1b 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -23,11 +23,15 @@ math/clc_fabs.cl
math/clc_fma.cl
math/clc_floor.cl
math/clc_frexp.cl
+math/clc_log.cl
+math/clc_log10.cl
+math/clc_log2.cl
math/clc_mad.cl
math/clc_modf.cl
math/clc_nan.cl
math/clc_nextafter.cl
math/clc_rint.cl
+math/clc_round.cl
math/clc_sw_fma.cl
math/clc_trunc.cl
relational/clc_all.cl
diff --git a/libclc/clc/lib/generic/math/clc_log.cl b/libclc/clc/lib/generic/math/clc_log.cl
new file mode 100644
index 0000000..ed10871
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log.cl
@@ -0,0 +1,38 @@
+#include <clc/clcmacro.h>
+#include <clc/float/definitions.h>
+#include <clc/internal/clc.h>
+#include <clc/math/clc_log2.h>
+
+/*
+ * log(x) = log2(x) * (1/log2(e))
+ */
+
+_CLC_OVERLOAD _CLC_DEF float __clc_log(float x) {
+ return __clc_log2(x) * (1.0f / M_LOG2E_F);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_log, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double __clc_log(double x) {
+ return __clc_log2(x) * (1.0 / M_LOG2E);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_log, double);
+
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __clc_log(half x) {
+ return (half)__clc_log2((float)x) * (1.0h / M_LOG2E_H);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_log, half);
+
+#endif // cl_khr_fp16
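All three scalar bodies above reduce the natural log to log2 via ln(x) = log2(x) * (1/log2(e)), where 1/log2(e) equals ln(2). A quick host-side spot check of that identity in plain C++ (not part of the CLC sources):

    #include <cassert>
    #include <cmath>

    int main() {
      const double inv_log2e = 1.0 / std::log2(std::exp(1.0)); // == ln(2)
      for (double x : {0.5, 1.0, 2.0, 10.0, 1.0e6}) {
        const double via_log2 = std::log2(x) * inv_log2e;
        const double direct = std::log(x);
        // Allow a little slack for rounding between the two routes.
        assert(std::fabs(via_log2 - direct) <=
               1.0e-12 * (1.0 + std::fabs(direct)));
      }
      return 0;
    }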
diff --git a/libclc/clc/lib/generic/math/clc_log10.cl b/libclc/clc/lib/generic/math/clc_log10.cl
new file mode 100644
index 0000000..8112cb8
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log10.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clcmacro.h>
+#include <clc/internal/clc.h>
+#include <clc/math/tables.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
+#define COMPILING_LOG10
+#include "clc_log_base.h"
+#undef COMPILING_LOG10
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_log10, float);
+
+#ifdef cl_khr_fp64
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_log10, double);
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_log10, half);
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/math/clc_log2.cl b/libclc/clc/lib/generic/math/clc_log2.cl
new file mode 100644
index 0000000..edd0d1c
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log2.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clcmacro.h>
+#include <clc/internal/clc.h>
+#include <clc/math/tables.h>
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // cl_khr_fp16
+
+#define COMPILING_LOG2
+#include "clc_log_base.h"
+#undef COMPILING_LOG2
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_log2, float);
+
+#ifdef cl_khr_fp64
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_log2, double);
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_log2, half);
+#endif // cl_khr_fp16
diff --git a/libclc/clc/lib/generic/math/clc_log_base.h b/libclc/clc/lib/generic/math/clc_log_base.h
new file mode 100644
index 0000000..238da27
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log_base.h
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_fma.h>
+#include <clc/math/clc_mad.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+
+/*
+ Algorithm:
+
+ Based on:
+ Ping-Tak Peter Tang
+ "Table-driven implementation of the logarithm function in IEEE
+ floating-point arithmetic"
+ ACM Transactions on Mathematical Software (TOMS)
+ Volume 16, Issue 4 (December 1990)
+
+
+ x very close to 1.0 is handled differently; for x everywhere else
+ a brief explanation is given below
+
+ x = (2^m)*A
+ x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+ x = (2^m)*2*(G/2+g/2)
+ x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+
+ Y = (2^(-1))*(2^(-m))*(2^m)*A
+ Now, range of Y is: 0.5 <= Y < 1
+
+ F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+ Now, range of F is: 128 <= F <= 256
+ F = F / 256
+ Now, range of F is: 0.5 <= F <= 1
+
+ f = -(Y-F), with (f <= 2^(-9))
+
+ log(x) = m*log(2) + log(2) + log(F-f)
+ log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+ log(x) = m*log(2) + log(2*F) + log(1-r)
+
+ r = (f/F), with (r <= 2^(-8))
+ r = f*(1/F) with (1/F) precomputed to avoid division
+
+ log(x) = m*log(2) + log(G) - poly
+
+ log(G) is precomputed
+ poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5
+
+ log(2) and log(G) need to be maintained in extra precision
+ to avoid losing precision in the calculations
+
+
+ For x close to 1.0, we employ the following technique to
+ ensure faster convergence.
+
+ log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
+ x = ((1+s)/(1-s))
+ x = 1 + r
+ s = r/(2+r)
+
+*/
+
+_CLC_OVERLOAD _CLC_DEF float
+#if defined(COMPILING_LOG2)
+__clc_log2(float x)
+#elif defined(COMPILING_LOG10)
+__clc_log10(float x)
+#else
+__clc_log(float x)
+#endif
+{
+
+#if defined(COMPILING_LOG2)
+ const float LOG2E = 0x1.715476p+0f; // 1.4426950408889634
+ const float LOG2E_HEAD = 0x1.700000p+0f; // 1.4375
+ const float LOG2E_TAIL = 0x1.547652p-8f; // 0.00519504072
+#elif defined(COMPILING_LOG10)
+ const float LOG10E = 0x1.bcb7b2p-2f; // 0.43429448190325182
+ const float LOG10E_HEAD = 0x1.bc0000p-2f; // 0.43359375
+ const float LOG10E_TAIL = 0x1.6f62a4p-11f; // 0.0007007319
+ const float LOG10_2_HEAD = 0x1.340000p-2f; // 0.30078125
+ const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637
+#else
+ const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
+ const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
+#endif
+
+ uint xi = __clc_as_uint(x);
+ uint ax = xi & EXSIGNBIT_SP32;
+
+ // Calculations for |x-1| < 2^-4
+ float r = x - 1.0f;
+ int near1 = __clc_fabs(r) < 0x1.0p-4f;
+ float u2 = MATH_DIVIDE(r, 2.0f + r);
+ float corr = u2 * r;
+ float u = u2 + u2;
+ float v = u * u;
+ float znear1, z1, z2;
+
+ // 2/(5 * 2^5), 2/(3 * 2^3)
+ z2 = __clc_mad(u, __clc_mad(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v, -corr);
+
+#if defined(COMPILING_LOG2)
+ z1 = __clc_as_float(__clc_as_int(r) & 0xffff0000);
+ z2 = z2 + (r - z1);
+ znear1 = __clc_mad(
+ z1, LOG2E_HEAD,
+ __clc_mad(z2, LOG2E_HEAD, __clc_mad(z1, LOG2E_TAIL, z2 * LOG2E_TAIL)));
+#elif defined(COMPILING_LOG10)
+ z1 = __clc_as_float(__clc_as_int(r) & 0xffff0000);
+ z2 = z2 + (r - z1);
+ znear1 = __clc_mad(
+ z1, LOG10E_HEAD,
+ __clc_mad(z2, LOG10E_HEAD, __clc_mad(z1, LOG10E_TAIL, z2 * LOG10E_TAIL)));
+#else
+ znear1 = z2 + r;
+#endif
+
+ // Calculations for x not near 1
+ int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+ // Normalize subnormal
+ uint xis = __clc_as_uint(__clc_as_float(xi | 0x3f800000) - 1.0f);
+ int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253;
+ int c = m == -127;
+ m = c ? ms : m;
+ uint xin = c ? xis : xi;
+
+ float mf = (float)m;
+ uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1);
+
+ // F - Y
+ float f = __clc_as_float(0x3f000000 | indx) -
+ __clc_as_float(0x3f000000 | (xin & MANTBITS_SP32));
+
+ indx = indx >> 16;
+ r = f * USE_TABLE(log_inv_tbl, indx);
+
+ // 1/3, 1/2
+ float poly = __clc_mad(__clc_mad(r, 0x1.555556p-2f, 0.5f), r * r, r);
+
+#if defined(COMPILING_LOG2)
+ float2 tv = USE_TABLE(log2_tbl, indx);
+ z1 = tv.s0 + mf;
+ z2 = __clc_mad(poly, -LOG2E, tv.s1);
+#elif defined(COMPILING_LOG10)
+ float2 tv = USE_TABLE(log10_tbl, indx);
+ z1 = __clc_mad(mf, LOG10_2_HEAD, tv.s0);
+ z2 = __clc_mad(poly, -LOG10E, mf * LOG10_2_TAIL) + tv.s1;
+#else
+ float2 tv = USE_TABLE(log_tbl, indx);
+ z1 = __clc_mad(mf, LOG2_HEAD, tv.s0);
+ z2 = __clc_mad(mf, LOG2_TAIL, -poly) + tv.s1;
+#endif
+
+ float z = z1 + z2;
+ z = near1 ? znear1 : z;
+
+ // Corner cases
+ z = ax >= PINFBITPATT_SP32 ? x : z;
+ z = xi != ax ? __clc_as_float(QNANBITPATT_SP32) : z;
+ z = ax == 0 ? __clc_as_float(NINFBITPATT_SP32) : z;
+
+ return z;
+}
+
+#ifdef cl_khr_fp64
+
+_CLC_OVERLOAD _CLC_DEF double
+#if defined(COMPILING_LOG2)
+__clc_log2(double x)
+#elif defined(COMPILING_LOG10)
+__clc_log10(double x)
+#else
+__clc_log(double x)
+#endif
+{
+
+#ifndef COMPILING_LOG2
+ // log2_lead and log2_tail sum to an extra-precise version of ln(2)
+ const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+ const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+#endif
+
+#if defined(COMPILING_LOG10)
+ // log10e_lead and log10e_tail sum to an extra-precision version of log10(e)
+ // (19 bits in lead)
+ const double log10e_lead =
+ 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
+ const double log10e_tail =
+ 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
+#elif defined(COMPILING_LOG2)
+ // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19
+ // bits in lead)
+ const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
+ const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
+#endif
+
+ // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+ // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
+ const double log_thresh1 = 0x1.e0faap-1;
+ const double log_thresh2 = 0x1.1082cp+0;
+
+ bool is_near = x >= log_thresh1 && x <= log_thresh2;
+
+ // Near 1 code
+ double r = x - 1.0;
+ double u = r / (2.0 + r);
+ double correction = r * u;
+ u = u + u;
+ double v = u * u;
+ double r1 = r;
+
+ const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
+ const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
+ const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
+ const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
+
+ double r2 = __clc_fma(
+ u * v, __clc_fma(v, __clc_fma(v, __clc_fma(v, ca_4, ca_3), ca_2), ca_1),
+ -correction);
+
+#if defined(COMPILING_LOG10)
+ r = r1;
+ r1 = __clc_as_double(__clc_as_ulong(r1) & 0xffffffff00000000);
+ r2 = r2 + (r - r1);
+ double ret_near = __clc_fma(
+ log10e_lead, r1,
+ __clc_fma(log10e_lead, r2, __clc_fma(log10e_tail, r1, log10e_tail * r2)));
+#elif defined(COMPILING_LOG2)
+ r = r1;
+ r1 = __clc_as_double(__clc_as_ulong(r1) & 0xffffffff00000000);
+ r2 = r2 + (r - r1);
+ double ret_near = __clc_fma(
+ log2e_lead, r1,
+ __clc_fma(log2e_lead, r2, __clc_fma(log2e_tail, r1, log2e_tail * r2)));
+#else
+ double ret_near = r1 + r2;
+#endif
+
+ // This is the far from 1 code
+
+ // Deal with subnormal
+ ulong ux = __clc_as_ulong(x);
+ ulong uxs =
+ __clc_as_ulong(__clc_as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
+ int c = ux < IMPBIT_DP64;
+ ux = c ? uxs : ux;
+ int expadjust = c ? 60 : 0;
+
+ int xexp = ((__clc_as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
+ double f = __clc_as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+ int index = __clc_as_int2(ux).hi >> 13;
+ index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
+
+ double2 tv = USE_TABLE(ln_tbl, index - 64);
+ double z1 = tv.s0;
+ double q = tv.s1;
+
+ double f1 = index * 0x1.0p-7;
+ double f2 = f - f1;
+ u = f2 / __clc_fma(f2, 0.5, f1);
+ v = u * u;
+
+ const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
+ const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */
+ const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
+
+ double poly = v * __clc_fma(v, __clc_fma(v, cb_3, cb_2), cb_1);
+ double z2 = q + __clc_fma(u, poly, u);
+
+ double dxexp = (double)xexp;
+#if defined(COMPILING_LOG10)
+ // Add xexp * log(2) to z1,z2 to get log(x)
+ r1 = __clc_fma(dxexp, log2_lead, z1);
+ r2 = __clc_fma(dxexp, log2_tail, z2);
+ double ret_far = __clc_fma(
+ log10e_lead, r1,
+ __clc_fma(log10e_lead, r2, __clc_fma(log10e_tail, r1, log10e_tail * r2)));
+#elif defined(COMPILING_LOG2)
+ r1 = __clc_fma(log2e_lead, z1, dxexp);
+ r2 = __clc_fma(log2e_lead, z2, __clc_fma(log2e_tail, z1, log2e_tail * z2));
+ double ret_far = r1 + r2;
+#else
+ r1 = __clc_fma(dxexp, log2_lead, z1);
+ r2 = __clc_fma(dxexp, log2_tail, z2);
+ double ret_far = r1 + r2;
+#endif
+
+ double ret = is_near ? ret_near : ret_far;
+
+ ret = __clc_isinf(x) ? __clc_as_double(PINFBITPATT_DP64) : ret;
+ ret = (__clc_isnan(x) | (x < 0.0)) ? __clc_as_double(QNANBITPATT_DP64) : ret;
+ ret = x == 0.0 ? __clc_as_double(NINFBITPATT_DP64) : ret;
+ return ret;
+}
+
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+_CLC_OVERLOAD _CLC_DEF half
+#if defined(COMPILING_LOG2)
+__clc_log2(half x) {
+ return (half)__clc_log2((float)x);
+}
+#elif defined(COMPILING_LOG10)
+__clc_log10(half x) {
+ return (half)__clc_log10((float)x);
+}
+#else
+__clc_log(half x) {
+ return (half)__clc_log((float)x);
+}
+#endif
+
+#endif // cl_khr_fp16
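The float path above keeps its constants as head/tail pairs (LOG2E_HEAD plus LOG2E_TAIL, and so on) and splits r the same way with as_float(as_int(r) & 0xffff0000), so products against the head part incur almost no rounding. A standalone sketch of that split, with hypothetical names and assuming IEEE-754 binary32 floats:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Zero the low 16 bits to form the "head"; the "tail" is the exact
    // remainder. head * c is then (nearly) exact for a similarly split c.
    static void split_head_tail(float x, float &head, float &tail) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= 0xffff0000u; // drop the low 16 significand bits
      std::memcpy(&head, &bits, sizeof(head));
      tail = x - head; // exact: head agrees with x in all leading bits
    }

    int main() {
      const float log2e = 0x1.715476p+0f; // the LOG2E constant above
      float head, tail;
      split_head_tail(log2e, head, tail);
      assert(head == 0x1.70p+0f); // the LOG2E_HEAD value in the source
      assert(head + tail == log2e);
      return 0;
    }

Note that the committed TAIL constants carry extra bits of the true mathematical value below float precision; the sketch only shows the splitting mechanism itself.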
diff --git a/libclc/clc/lib/generic/math/clc_round.cl b/libclc/clc/lib/generic/math/clc_round.cl
new file mode 100644
index 0000000..dfb3ee6
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_round.cl
@@ -0,0 +1,6 @@
+#include <clc/internal/clc.h>
+
+#undef __CLC_FUNCTION
+#define __CLC_FUNCTION __clc_round
+#define __CLC_BUILTIN __builtin_elementwise_round
+#include <clc/math/unary_builtin.inc>
diff --git a/libclc/generic/include/clc/math/log10.h b/libclc/generic/include/clc/math/log10.h
index ec4e4ae..56a4583 100644
--- a/libclc/generic/include/clc/math/log10.h
+++ b/libclc/generic/include/clc/math/log10.h
@@ -1,5 +1,3 @@
-#undef log10
-
#define __CLC_BODY <clc/math/unary_decl.inc>
#define __CLC_FUNCTION log10
diff --git a/libclc/generic/include/math/binary_intrin.inc b/libclc/generic/include/math/binary_intrin.inc
deleted file mode 100644
index d1a3cae..0000000
--- a/libclc/generic/include/math/binary_intrin.inc
+++ /dev/null
@@ -1,29 +0,0 @@
-_CLC_OVERLOAD float __CLC_FUNCTION(float, float) __asm(__CLC_INTRINSIC ".f32");
-_CLC_OVERLOAD float2 __CLC_FUNCTION(float2, float2) __asm(__CLC_INTRINSIC ".v2f32");
-_CLC_OVERLOAD float3 __CLC_FUNCTION(float3, float3) __asm(__CLC_INTRINSIC ".v3f32");
-_CLC_OVERLOAD float4 __CLC_FUNCTION(float4, float4) __asm(__CLC_INTRINSIC ".v4f32");
-_CLC_OVERLOAD float8 __CLC_FUNCTION(float8, float8) __asm(__CLC_INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 __CLC_FUNCTION(float16, float16) __asm(__CLC_INTRINSIC ".v16f32");
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-_CLC_OVERLOAD double __CLC_FUNCTION(double, double) __asm(__CLC_INTRINSIC ".f64");
-_CLC_OVERLOAD double2 __CLC_FUNCTION(double2, double2) __asm(__CLC_INTRINSIC ".v2f64");
-_CLC_OVERLOAD double3 __CLC_FUNCTION(double3, double3) __asm(__CLC_INTRINSIC ".v3f64");
-_CLC_OVERLOAD double4 __CLC_FUNCTION(double4, double4) __asm(__CLC_INTRINSIC ".v4f64");
-_CLC_OVERLOAD double8 __CLC_FUNCTION(double8, double8) __asm(__CLC_INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 __CLC_FUNCTION(double16, double16) __asm(__CLC_INTRINSIC ".v16f64");
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-_CLC_OVERLOAD half __CLC_FUNCTION(half, half) __asm(__CLC_INTRINSIC ".f16");
-_CLC_OVERLOAD half2 __CLC_FUNCTION(half2, half2) __asm(__CLC_INTRINSIC ".v2f16");
-_CLC_OVERLOAD half3 __CLC_FUNCTION(half3, half3) __asm(__CLC_INTRINSIC ".v3f16");
-_CLC_OVERLOAD half4 __CLC_FUNCTION(half4, half4) __asm(__CLC_INTRINSIC ".v4f16");
-_CLC_OVERLOAD half8 __CLC_FUNCTION(half8, half8) __asm(__CLC_INTRINSIC ".v8f16");
-_CLC_OVERLOAD half16 __CLC_FUNCTION(half16, half16) __asm(__CLC_INTRINSIC ".v16f16");
-#endif
-
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
diff --git a/libclc/generic/include/math/ternary_intrin.inc b/libclc/generic/include/math/ternary_intrin.inc
deleted file mode 100644
index b384b26..0000000
--- a/libclc/generic/include/math/ternary_intrin.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-_CLC_OVERLOAD float __CLC_FUNCTION(float, float, float) __asm(__CLC_INTRINSIC ".f32");
-_CLC_OVERLOAD float2 __CLC_FUNCTION(float2, float2, float2) __asm(__CLC_INTRINSIC ".v2f32");
-_CLC_OVERLOAD float3 __CLC_FUNCTION(float3, float3, float3) __asm(__CLC_INTRINSIC ".v3f32");
-_CLC_OVERLOAD float4 __CLC_FUNCTION(float4, float4, float4) __asm(__CLC_INTRINSIC ".v4f32");
-_CLC_OVERLOAD float8 __CLC_FUNCTION(float8, float8, float8) __asm(__CLC_INTRINSIC ".v8f32");
-_CLC_OVERLOAD float16 __CLC_FUNCTION(float16, float16, float16) __asm(__CLC_INTRINSIC ".v16f32");
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-_CLC_OVERLOAD double __CLC_FUNCTION(double, double, double) __asm(__CLC_INTRINSIC ".f64");
-_CLC_OVERLOAD double2 __CLC_FUNCTION(double2, double2, double2) __asm(__CLC_INTRINSIC ".v2f64");
-_CLC_OVERLOAD double3 __CLC_FUNCTION(double3, double3, double3) __asm(__CLC_INTRINSIC ".v3f64");
-_CLC_OVERLOAD double4 __CLC_FUNCTION(double4, double4, double4) __asm(__CLC_INTRINSIC ".v4f64");
-_CLC_OVERLOAD double8 __CLC_FUNCTION(double8, double8, double8) __asm(__CLC_INTRINSIC ".v8f64");
-_CLC_OVERLOAD double16 __CLC_FUNCTION(double16, double16, double16) __asm(__CLC_INTRINSIC ".v16f64");
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16: enable
-_CLC_OVERLOAD half __CLC_FUNCTION(half, half, half) __asm(__CLC_INTRINSIC ".f16");
-_CLC_OVERLOAD half2 __CLC_FUNCTION(half2, half2, half2) __asm(__CLC_INTRINSIC ".v2f16");
-_CLC_OVERLOAD half3 __CLC_FUNCTION(half3, half3, half3) __asm(__CLC_INTRINSIC ".v3f16");
-_CLC_OVERLOAD half4 __CLC_FUNCTION(half4, half4, half4) __asm(__CLC_INTRINSIC ".v4f16");
-_CLC_OVERLOAD half8 __CLC_FUNCTION(half8, half8, half8) __asm(__CLC_INTRINSIC ".v8f16");
-_CLC_OVERLOAD half16 __CLC_FUNCTION(half16, half16, half16) __asm(__CLC_INTRINSIC ".v16f16");
-#endif
-
-
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
diff --git a/libclc/generic/lib/math/log.cl b/libclc/generic/lib/math/log.cl
index 336c801..0f87bd2 100644
--- a/libclc/generic/lib/math/log.cl
+++ b/libclc/generic/lib/math/log.cl
@@ -1,26 +1,8 @@
#include <clc/clc.h>
#include <clc/clcmacro.h>
+#include <clc/math/clc_log.h>
-/*
- *log(x) = log2(x) * (1/log2(e))
- */
+#define FUNCTION log
+#define __CLC_BODY <clc/shared/unary_def.inc>
-_CLC_OVERLOAD _CLC_DEF float log(float x)
-{
- return log2(x) * (1.0f / M_LOG2E_F);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log, float);
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-_CLC_OVERLOAD _CLC_DEF double log(double x)
-{
- return log2(x) * (1.0 / M_LOG2E);
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log, double);
-
-#endif // cl_khr_fp64
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/log10.cl b/libclc/generic/lib/math/log10.cl
index 38e36d7..c374d98 100644
--- a/libclc/generic/lib/math/log10.cl
+++ b/libclc/generic/lib/math/log10.cl
@@ -1,47 +1,8 @@
-/*
- * Copyright (c) 2015 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/math/tables.h>
#include <clc/clc.h>
#include <clc/clcmacro.h>
+#include <clc/math/clc_log10.h>
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#endif // cl_khr_fp64
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // cl_khr_fp16
-
-#define COMPILING_LOG10
-#include "log_base.h"
-#undef COMPILING_LOG10
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log10, float);
-
-#ifdef cl_khr_fp64
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log10, double);
-#endif // cl_khr_fp64
+#define FUNCTION log10
+#define __CLC_BODY <clc/shared/unary_def.inc>
-#ifdef cl_khr_fp16
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log10, half);
-#endif // cl_khr_fp16
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/log2.cl b/libclc/generic/lib/math/log2.cl
index c3bcf64..f9d94d6 100644
--- a/libclc/generic/lib/math/log2.cl
+++ b/libclc/generic/lib/math/log2.cl
@@ -1,47 +1,8 @@
-/*
- * Copyright (c) 2015 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/math/tables.h>
#include <clc/clc.h>
#include <clc/clcmacro.h>
+#include <clc/math/clc_log2.h>
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#endif // cl_khr_fp64
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // cl_khr_fp16
-
-#define COMPILING_LOG2
-#include "log_base.h"
-#undef COMPILING_LOG2
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float);
-
-#ifdef cl_khr_fp64
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double);
-#endif // cl_khr_fp64
+#define FUNCTION log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
-#ifdef cl_khr_fp16
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, log2, half);
-#endif // cl_khr_fp16
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/log_base.h b/libclc/generic/lib/math/log_base.h
deleted file mode 100644
index bd0169c..0000000
--- a/libclc/generic/lib/math/log_base.h
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/math/math.h>
-
-/*
- Algorithm:
-
- Based on:
- Ping-Tak Peter Tang
- "Table-driven implementation of the logarithm function in IEEE
- floating-point arithmetic"
- ACM Transactions on Mathematical Software (TOMS)
- Volume 16, Issue 4 (December 1990)
-
-
- x very close to 1.0 is handled differently, for x everywhere else
- a brief explanation is given below
-
- x = (2^m)*A
- x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
- x = (2^m)*2*(G/2+g/2)
- x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
-
- Y = (2^(-1))*(2^(-m))*(2^m)*A
- Now, range of Y is: 0.5 <= Y < 1
-
- F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
- Now, range of F is: 128 <= F <= 256
- F = F / 256
- Now, range of F is: 0.5 <= F <= 1
-
- f = -(Y-F), with (f <= 2^(-9))
-
- log(x) = m*log(2) + log(2) + log(F-f)
- log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
- log(x) = m*log(2) + log(2*F) + log(1-r)
-
- r = (f/F), with (r <= 2^(-8))
- r = f*(1/F) with (1/F) precomputed to avoid division
-
- log(x) = m*log(2) + log(G) - poly
-
- log(G) is precomputed
- poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5))
-
- log(2) and log(G) need to be maintained in extra precision
- to avoid losing precision in the calculations
-
-
- For x close to 1.0, we employ the following technique to
- ensure faster convergence.
-
- log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
- x = ((1+s)/(1-s))
- x = 1 + r
- s = r/(2+r)
-
-*/
-
-_CLC_OVERLOAD _CLC_DEF float
-#if defined(COMPILING_LOG2)
-log2(float x)
-#elif defined(COMPILING_LOG10)
-log10(float x)
-#else
-log(float x)
-#endif
-{
-
-#if defined(COMPILING_LOG2)
- const float LOG2E = 0x1.715476p+0f; // 1.4426950408889634
- const float LOG2E_HEAD = 0x1.700000p+0f; // 1.4375
- const float LOG2E_TAIL = 0x1.547652p-8f; // 0.00519504072
-#elif defined(COMPILING_LOG10)
- const float LOG10E = 0x1.bcb7b2p-2f; // 0.43429448190325182
- const float LOG10E_HEAD = 0x1.bc0000p-2f; // 0.43359375
- const float LOG10E_TAIL = 0x1.6f62a4p-11f; // 0.0007007319
- const float LOG10_2_HEAD = 0x1.340000p-2f; // 0.30078125
- const float LOG10_2_TAIL = 0x1.04d426p-12f; // 0.000248745637
-#else
- const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
-#endif
-
- uint xi = as_uint(x);
- uint ax = xi & EXSIGNBIT_SP32;
-
- // Calculations for |x-1| < 2^-4
- float r = x - 1.0f;
- int near1 = fabs(r) < 0x1.0p-4f;
- float u2 = MATH_DIVIDE(r, 2.0f + r);
- float corr = u2 * r;
- float u = u2 + u2;
- float v = u * u;
- float znear1, z1, z2;
-
- // 2/(5 * 2^5), 2/(3 * 2^3)
- z2 = mad(u, mad(v, 0x1.99999ap-7f, 0x1.555556p-4f)*v, -corr);
-
-#if defined(COMPILING_LOG2)
- z1 = as_float(as_int(r) & 0xffff0000);
- z2 = z2 + (r - z1);
- znear1 = mad(z1, LOG2E_HEAD, mad(z2, LOG2E_HEAD, mad(z1, LOG2E_TAIL, z2*LOG2E_TAIL)));
-#elif defined(COMPILING_LOG10)
- z1 = as_float(as_int(r) & 0xffff0000);
- z2 = z2 + (r - z1);
- znear1 = mad(z1, LOG10E_HEAD, mad(z2, LOG10E_HEAD, mad(z1, LOG10E_TAIL, z2*LOG10E_TAIL)));
-#else
- znear1 = z2 + r;
-#endif
-
- // Calculations for x not near 1
- int m = (int)(xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
-
- // Normalize subnormal
- uint xis = as_uint(as_float(xi | 0x3f800000) - 1.0f);
- int ms = (int)(xis >> EXPSHIFTBITS_SP32) - 253;
- int c = m == -127;
- m = c ? ms : m;
- uint xin = c ? xis : xi;
-
- float mf = (float)m;
- uint indx = (xin & 0x007f0000) + ((xin & 0x00008000) << 1);
-
- // F - Y
- float f = as_float(0x3f000000 | indx) - as_float(0x3f000000 | (xin & MANTBITS_SP32));
-
- indx = indx >> 16;
- r = f * USE_TABLE(log_inv_tbl, indx);
-
- // 1/3, 1/2
- float poly = mad(mad(r, 0x1.555556p-2f, 0.5f), r*r, r);
-
-#if defined(COMPILING_LOG2)
- float2 tv = USE_TABLE(log2_tbl, indx);
- z1 = tv.s0 + mf;
- z2 = mad(poly, -LOG2E, tv.s1);
-#elif defined(COMPILING_LOG10)
- float2 tv = USE_TABLE(log10_tbl, indx);
- z1 = mad(mf, LOG10_2_HEAD, tv.s0);
- z2 = mad(poly, -LOG10E, mf*LOG10_2_TAIL) + tv.s1;
-#else
- float2 tv = USE_TABLE(log_tbl, indx);
- z1 = mad(mf, LOG2_HEAD, tv.s0);
- z2 = mad(mf, LOG2_TAIL, -poly) + tv.s1;
-#endif
-
- float z = z1 + z2;
- z = near1 ? znear1 : z;
-
- // Corner cases
- z = ax >= PINFBITPATT_SP32 ? x : z;
- z = xi != ax ? as_float(QNANBITPATT_SP32) : z;
- z = ax == 0 ? as_float(NINFBITPATT_SP32) : z;
-
- return z;
-}
-
-#ifdef cl_khr_fp64
-
-_CLC_OVERLOAD _CLC_DEF double
-#if defined(COMPILING_LOG2)
-log2(double x)
-#elif defined(COMPILING_LOG10)
-log10(double x)
-#else
-log(double x)
-#endif
-{
-
-#ifndef COMPILING_LOG2
- // log2_lead and log2_tail sum to an extra-precise version of ln(2)
- const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
- const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
-#endif
-
-#if defined(COMPILING_LOG10)
- // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
- const double log10e_lead = 4.34293746948242187500e-01; /* 0x3fdbcb7800000000 */
- const double log10e_tail = 7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
-#elif defined(COMPILING_LOG2)
- // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
- const double log2e_lead = 1.44269180297851562500E+00; /* 0x3FF7154400000000 */
- const double log2e_tail = 3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
-#endif
-
- // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
- // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
- const double log_thresh1 = 0x1.e0faap-1;
- const double log_thresh2 = 0x1.1082cp+0;
-
- bool is_near = x >= log_thresh1 && x <= log_thresh2;
-
- // Near 1 code
- double r = x - 1.0;
- double u = r / (2.0 + r);
- double correction = r * u;
- u = u + u;
- double v = u * u;
- double r1 = r;
-
- const double ca_1 = 8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
- const double ca_2 = 1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
- const double ca_3 = 2.23213998791944806202e-03; /* 0x3f62492307f1519f */
- const double ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
-
- double r2 = fma(u*v, fma(v, fma(v, fma(v, ca_4, ca_3), ca_2), ca_1), -correction);
-
-#if defined(COMPILING_LOG10)
- r = r1;
- r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
- r2 = r2 + (r - r1);
- double ret_near = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail * r2)));
-#elif defined(COMPILING_LOG2)
- r = r1;
- r1 = as_double(as_ulong(r1) & 0xffffffff00000000);
- r2 = r2 + (r - r1);
- double ret_near = fma(log2e_lead, r1, fma(log2e_lead, r2, fma(log2e_tail, r1, log2e_tail*r2)));
-#else
- double ret_near = r1 + r2;
-#endif
-
- // This is the far from 1 code
-
- // Deal with subnormal
- ulong ux = as_ulong(x);
- ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962);
- int c = ux < IMPBIT_DP64;
- ux = c ? uxs : ux;
- int expadjust = c ? 60 : 0;
-
- int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust;
- double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
- int index = as_int2(ux).hi >> 13;
- index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
-
- double2 tv = USE_TABLE(ln_tbl, index - 64);
- double z1 = tv.s0;
- double q = tv.s1;
-
- double f1 = index * 0x1.0p-7;
- double f2 = f - f1;
- u = f2 / fma(f2, 0.5, f1);
- v = u * u;
-
- const double cb_1 = 8.33333333333333593622e-02; /* 0x3fb5555555555557 */
- const double cb_2 = 1.24999999978138668903e-02; /* 0x3f89999999865ede */
- const double cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
-
- double poly = v * fma(v, fma(v, cb_3, cb_2), cb_1);
- double z2 = q + fma(u, poly, u);
-
- double dxexp = (double)xexp;
-#if defined (COMPILING_LOG10)
- // Add xexp * log(2) to z1,z2 to get log(x)
- r1 = fma(dxexp, log2_lead, z1);
- r2 = fma(dxexp, log2_tail, z2);
- double ret_far = fma(log10e_lead, r1, fma(log10e_lead, r2, fma(log10e_tail, r1, log10e_tail*r2)));
-#elif defined(COMPILING_LOG2)
- r1 = fma(log2e_lead, z1, dxexp);
- r2 = fma(log2e_lead, z2, fma(log2e_tail, z1, log2e_tail*z2));
- double ret_far = r1 + r2;
-#else
- r1 = fma(dxexp, log2_lead, z1);
- r2 = fma(dxexp, log2_tail, z2);
- double ret_far = r1 + r2;
-#endif
-
- double ret = is_near ? ret_near : ret_far;
-
- ret = isinf(x) ? as_double(PINFBITPATT_DP64) : ret;
- ret = (isnan(x) | (x < 0.0)) ? as_double(QNANBITPATT_DP64) : ret;
- ret = x == 0.0 ? as_double(NINFBITPATT_DP64) : ret;
- return ret;
-}
-
-#endif // cl_khr_fp64
-
-#ifdef cl_khr_fp16
-
-_CLC_OVERLOAD _CLC_DEF half
-#if defined(COMPILING_LOG2)
-log2(half x) {
- return (half)log2((float)x);
-}
-#elif defined(COMPILING_LOG10)
-log10(half x) {
- return (half)log10((float)x);
-}
-#else
-log(half x) {
- return (half)log((float)x);
-}
-#endif
-
-#endif // cl_khr_fp16
diff --git a/libclc/generic/lib/math/round.cl b/libclc/generic/lib/math/round.cl
index 6344051..46f8cf6 100644
--- a/libclc/generic/lib/math/round.cl
+++ b/libclc/generic/lib/math/round.cl
@@ -1,9 +1,6 @@
#include <clc/clc.h>
-
-// Map the llvm intrinsic to an OpenCL function.
-#define __CLC_FUNCTION __clc_round
-#define __CLC_INTRINSIC "llvm.round"
-#include <clc/math/unary_intrin.inc>
+#include <clc/clcmacro.h>
+#include <clc/math/clc_round.h>
#undef __CLC_FUNCTION
#define __CLC_FUNCTION round
diff --git a/libcxx/include/future b/libcxx/include/future
index c17e79f..977273e 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -865,7 +865,8 @@ void __async_assoc_state<_Rp, _Fp>::__execute() {
template <class _Rp, class _Fp>
void __async_assoc_state<_Rp, _Fp>::__on_zero_shared() _NOEXCEPT {
- this->wait();
+ if (base::__state_ & base::__constructed)
+ this->wait();
base::__on_zero_shared();
}
@@ -902,7 +903,8 @@ void __async_assoc_state<void, _Fp>::__execute() {
template <class _Fp>
void __async_assoc_state<void, _Fp>::__on_zero_shared() _NOEXCEPT {
- this->wait();
+ if (base::__state_ & base::__constructed)
+ this->wait();
base::__on_zero_shared();
}
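The change above guards the teardown-time wait on the __constructed bit, so a state whose result was never constructed is no longer waited on. A minimal sketch of that guard pattern, with hypothetical names and none of libc++'s internals:

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    // Hypothetical stand-in for the shared state: wait on teardown only if
    // a result was ever constructed, so early teardown cannot block.
    class shared_state {
    public:
      static constexpr unsigned constructed = 1, ready = 2;

      void set_value() {
        std::lock_guard<std::mutex> lk(mut_);
        state_ |= constructed | ready;
        cv_.notify_all();
      }

      void on_zero_shared() {
        unsigned s;
        {
          std::lock_guard<std::mutex> lk(mut_);
          s = state_;
        }
        if (s & constructed) // the guard added by the patch above
          wait();
      }

    private:
      void wait() {
        std::unique_lock<std::mutex> lk(mut_);
        cv_.wait(lk, [this] { return (state_ & ready) != 0; });
      }

      std::mutex mut_;
      std::condition_variable cv_;
      unsigned state_ = 0;
    };

    int main() {
      shared_state st;
      st.on_zero_shared(); // nothing constructed: returns without blocking

      std::thread producer([&] { st.set_value(); });
      producer.join();
      st.on_zero_shared(); // constructed: the wait is already satisfied
      return 0;
    }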
diff --git a/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp b/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp
index 30e9672..349dc51 100644
--- a/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp
+++ b/libcxx/test/libcxx/atomics/atomics.syn/compatible_with_stdatomic.compile.pass.cpp
@@ -9,8 +9,6 @@
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
// This test verifies that <stdatomic.h> redirects to <atomic>.
// Before C++23, <stdatomic.h> can be included after <atomic>, but including it
diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp
index 37ab176..455c997 100644
--- a/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/filebuf/traits_mismatch.verify.cpp
@@ -15,18 +15,8 @@
// UNSUPPORTED: no-wide-characters
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <fstream>
std::basic_filebuf<char, std::char_traits<wchar_t> > f;
-// expected-error-re@streambuf:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@fstream:* {{only virtual member functions can be marked 'override'}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error@*:* 9 {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
index f936d8d..cc52cc1 100644
--- a/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/traits_mismatch.verify.cpp
@@ -15,13 +15,11 @@
// UNSUPPORTED: no-wide-characters
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <fstream>
std::basic_fstream<char, std::char_traits<wchar_t> > f;
-// expected-error-re@ios:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error-re@streambuf:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
// expected-error@*:* 11 {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/libcxx/input.output/iostream.format/input.streams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/iostream.format/input.streams/traits_mismatch.verify.cpp
index 7d713ed..a03aed1 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/input.streams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/input.streams/traits_mismatch.verify.cpp
@@ -15,13 +15,11 @@
// UNSUPPORTED: no-wide-characters
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <istream>
#include <string>
struct test_istream
: public std::basic_istream<char, std::char_traits<wchar_t> > {};
-// expected-error-re@ios:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error@istream:* {{only virtual member functions can be marked 'override'}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error@*:* {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
index 445ddd1..9e7bc99 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/traits_mismatch.verify.cpp
@@ -15,13 +15,11 @@
// UNSUPPORTED: no-wide-characters
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <ostream>
#include <string>
struct test_ostream
: public std::basic_ostream<char, std::char_traits<wchar_t> > {};
-// expected-error-re@ios:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
// expected-error@*:* {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/libcxx/input.output/string.streams/traits_mismatch.verify.cpp b/libcxx/test/libcxx/input.output/string.streams/traits_mismatch.verify.cpp
index 89dc884..36f3222 100644
--- a/libcxx/test/libcxx/input.output/string.streams/traits_mismatch.verify.cpp
+++ b/libcxx/test/libcxx/input.output/string.streams/traits_mismatch.verify.cpp
@@ -16,16 +16,10 @@
// UNSUPPORTED: no-wide-characters
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <sstream>
std::basic_stringbuf<char, std::char_traits<wchar_t> > sb;
-// expected-error-re@streambuf:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error-re@string:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
+// expected-error-re@*:* {{static assertion failed{{.*}}traits_type::char_type must be the same type as CharT}}
-// expected-error@sstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@sstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@sstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@sstream:* {{only virtual member functions can be marked 'override'}}
-// expected-error@sstream:* {{only virtual member functions can be marked 'override'}}
+// expected-error@*:* 5 {{only virtual member functions can be marked 'override'}}
diff --git a/libcxx/test/std/containers/sequences/array/array.fill/fill.verify.cpp b/libcxx/test/std/containers/sequences/array/array.fill/fill.verify.cpp
index d82d0d6..1990816 100644
--- a/libcxx/test/std/containers/sequences/array/array.fill/fill.verify.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.fill/fill.verify.cpp
@@ -10,8 +10,6 @@
// void fill(const T& u);
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <array>
#include <cassert>
@@ -20,7 +18,7 @@ int main(int, char**) {
typedef double T;
typedef std::array<const T, 0> C;
C c = {};
- // expected-error-re@array:* {{static assertion failed{{.*}}cannot fill zero-sized array of type 'const T'}}
+ // expected-error-re@*:* {{static assertion failed{{.*}}cannot fill zero-sized array of type 'const T'}}
c.fill(5.5); // expected-note {{requested here}}
}
diff --git a/libcxx/test/std/containers/sequences/array/array.swap/swap.verify.cpp b/libcxx/test/std/containers/sequences/array/array.swap/swap.verify.cpp
index 4d0ee39..337815c 100644
--- a/libcxx/test/std/containers/sequences/array/array.swap/swap.verify.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.swap/swap.verify.cpp
@@ -10,8 +10,6 @@
// void swap(array& a);
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <array>
#include <cassert>
@@ -21,7 +19,7 @@ int main(int, char**) {
typedef std::array<const T, 0> C;
C c = {};
C c2 = {};
- // expected-error-re@array:* {{static assertion failed{{.*}}cannot swap zero-sized array of type 'const T'}}
+ // expected-error-re@*:* {{static assertion failed{{.*}}cannot swap zero-sized array of type 'const T'}}
c.swap(c2); // expected-note {{requested here}}
}
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/get.verify.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/get.verify.cpp
index 169f06f..0236831 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/get.verify.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/get.verify.cpp
@@ -10,8 +10,6 @@
// template <size_t I, class T, size_t N> T& get(array<T, N>& a);
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
// Prevent -Warray-bounds from issuing a diagnostic when testing with clang verify.
// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-array-bounds
@@ -23,5 +21,5 @@ void f() {
typedef std::array<T, 3> C;
C c = {1, 2, 3.5};
std::get<3>(c) = 5.5; // expected-note {{requested here}}
- // expected-error-re@array:* {{static assertion failed{{( due to requirement '3U[L]{0,2} < 3U[L]{0,2}')?}}{{.*}}Index out of bounds in std::get<> (std::array)}}
+ // expected-error-re@*:* {{static assertion failed{{( due to requirement '3U[L]{0,2} < 3U[L]{0,2}')?}}{{.*}}Index out of bounds in std::get<> (std::array)}}
}
diff --git a/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.verify.cpp b/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.verify.cpp
index b5c8522..fad333e 100644
--- a/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.verify.cpp
+++ b/libcxx/test/std/containers/sequences/array/array.tuple/tuple_element.verify.cpp
@@ -10,12 +10,10 @@
// tuple_element<I, array<T, N> >::type
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <array>
#include <cassert>
typedef double T;
typedef std::array<T, 3> C;
std::tuple_element<3, C> foo; // expected-note {{requested here}}
-// expected-error-re@array:* {{static assertion failed{{( due to requirement '3U[L]{0,2} < 3U[L]{0,2}')?}}{{.*}}Index out of bounds in std::tuple_element<> (std::array)}}
+// expected-error-re@*:* {{static assertion failed{{( due to requirement '3U[L]{0,2} < 3U[L]{0,2}')?}}{{.*}}Index out of bounds in std::tuple_element<> (std::array)}}
diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.comp/equal.pass.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.comp/equal.pass.cpp
index ef0fd5c..9139207 100644
--- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.comp/equal.pass.cpp
+++ b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.comp/equal.pass.cpp
@@ -19,9 +19,10 @@
#include <regex>
#include "test_comparisons.h"
+#include "test_macros.h"
int main(int, char**) {
-#if _LIBCPP_STD_VER >= 20
+#if TEST_STD_VER >= 20
AssertEqualityReturnBool<std::cregex_token_iterator>();
{
diff --git a/libcxx/test/std/strings/basic.string/char.bad.verify.cpp b/libcxx/test/std/strings/basic.string/char.bad.verify.cpp
index c206c1c..557604b 100644
--- a/libcxx/test/std/strings/basic.string/char.bad.verify.cpp
+++ b/libcxx/test/std/strings/basic.string/char.bad.verify.cpp
@@ -9,8 +9,6 @@
// <string>
// ... manipulating sequences of any non-array trivial standard-layout types.
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
#include <string>
#include "test_traits.h"
@@ -35,20 +33,20 @@ void f() {
typedef char C[3];
static_assert(std::is_array<C>::value, "");
std::basic_string<C, test_traits<C> > s;
- // expected-error-re@string:* {{static assertion failed{{.*}}Character type of basic_string must not be an array}}
+ // expected-error-re@*:* {{static assertion failed{{.*}}Character type of basic_string must not be an array}}
}
{
// not trivial
static_assert(!std::is_trivial<NotTrivial>::value, "");
std::basic_string<NotTrivial, test_traits<NotTrivial> > s;
- // expected-error-re@string:* {{static assertion failed{{.*}}Character type of basic_string must be trivial}}
+ // expected-error-re@*:* {{static assertion failed{{.*}}Character type of basic_string must be trivial}}
}
{
// not standard layout
static_assert(!std::is_standard_layout<NotStandardLayout>::value, "");
std::basic_string<NotStandardLayout, test_traits<NotStandardLayout> > s;
- // expected-error-re@string:* {{static assertion failed{{.*}}Character type of basic_string must be standard-layout}}
+ // expected-error-re@*:* {{static assertion failed{{.*}}Character type of basic_string must be standard-layout}}
}
}
diff --git a/libcxx/test/std/thread/futures/futures.async/thread_create_failure.pass.cpp b/libcxx/test/std/thread/futures/futures.async/thread_create_failure.pass.cpp
new file mode 100644
index 0000000..0d0c630
--- /dev/null
+++ b/libcxx/test/std/thread/futures/futures.async/thread_create_failure.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: no-threads, no-exceptions
+
+// ASan seems to try to create threads, which obviously doesn't work in this test.
+// UNSUPPORTED: asan
+// UNSUPPORTED: hwasan
+
+// UNSUPPORTED: c++03
+
+// There is no way to limit the number of threads on Windows.
+// UNSUPPORTED: windows
+
+// AIX and macOS seem to limit the number of processes, not threads, via RLIMIT_NPROC.
+// XFAIL: target={{.+}}-aix{{.*}}
+// XFAIL: target={{.+}}-apple-{{.*}}
+
+// This test makes sure that we fail gracefully in case thread creation fails. This is only reliably
+// possible on systems that allow limiting the number of threads that can be created. See https://llvm.org/PR125428 for more details.
+
+#include <cassert>
+#include <future>
+#include <system_error>
+
+#if __has_include(<sys/resource.h>)
+# include <sys/resource.h>
+# ifdef RLIMIT_NPROC
+void force_thread_creation_failure() {
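+  // Both the soft and the hard limit are set to 1; since the main thread
+  // already counts against RLIMIT_NPROC, creating any further thread should
+  // fail with EAGAIN.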
+ rlimit lim = {1, 1};
+ assert(setrlimit(RLIMIT_NPROC, &lim) == 0);
+}
+# else
+# error "No known way to force only one thread being available"
+# endif
+#else
+# error "No known way to force only one thread being available"
+#endif
+
+int main(int, char**) {
+ force_thread_creation_failure();
+
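+  // "Failing gracefully" here means std::async throws std::system_error
+  // instead of crashing when the underlying thread cannot be created.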
+ try {
+ std::future<int> fut = std::async(std::launch::async, [] { return 1; });
+ assert(false);
+ } catch (const std::system_error&) {
+ }
+
+ try {
+ std::future<void> fut = std::async(std::launch::async, [] { return; });
+ assert(false);
+ } catch (const std::system_error&) {
+ }
+
+ return 0;
+}
diff --git a/lldb/examples/python/fzf_history.py b/lldb/examples/python/fzf_history.py
new file mode 100644
index 0000000..546edb2
--- /dev/null
+++ b/lldb/examples/python/fzf_history.py
@@ -0,0 +1,110 @@
+import os
+import re
+import sys
+import subprocess
+import tempfile
+
+import lldb
+
+
+@lldb.command()
+def fzf_history(debugger, cmdstr, ctx, result, _):
+ """Use fzf to search and select from lldb command history."""
+ history_file = os.path.expanduser("~/.lldb/lldb-widehistory")
+ if not os.path.exists(history_file):
+ result.SetError("history file does not exist")
+ return
+ history = _load_history(history_file)
+
+ if sys.platform != "darwin":
+        # Integrating fzf's result into lldb relies on copy and paste.
+        # In its absence, run the selected command directly.
+ temp_file = tempfile.NamedTemporaryFile("r")
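+        # fzf writes the accepted entry into this temp file via the
+        # execute-silent binding below; the command is then read back and run.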
+ fzf_command = (
+ "fzf",
+ "--no-sort",
+ f"--query={cmdstr}",
+ f"--bind=enter:execute-silent(echo -n {{}} > {temp_file.name})+accept",
+ )
+ subprocess.run(fzf_command, input=history, text=True)
+ command = temp_file.read()
+ debugger.HandleCommand(command)
+ return
+
+ # Capture the current pasteboard contents to restore after overwriting.
+ paste_snapshot = subprocess.run("pbpaste", text=True, capture_output=True).stdout
+
+ # On enter, copy the selected history entry into the pasteboard.
+ fzf_command = (
+ "fzf",
+ "--no-sort",
+ f"--query={cmdstr}",
+ "--bind=enter:execute-silent(echo -n {} | pbcopy)+close",
+ )
+ completed = subprocess.run(fzf_command, input=history, text=True)
+ # 130 is used for CTRL-C or ESC.
+ if completed.returncode not in (0, 130):
+ result.SetError("fzf failed")
+ return
+
+ # Get the user's selected history entry.
+ selected_command = subprocess.run("pbpaste", text=True, capture_output=True).stdout
+ if selected_command == paste_snapshot:
+ # Nothing was selected, no cleanup needed.
+ return
+
+ _handle_command(debugger, selected_command)
+
+ # Restore the pasteboard's contents.
+ subprocess.run("pbcopy", input=paste_snapshot, text=True)
+
+
+def _handle_command(debugger, command):
+ """Try pasting the command, and failing that, run it directly."""
+ if not command:
+ return
+
+ # Use applescript to paste the selected result into lldb's console.
+ paste_command = (
+ "osascript",
+ "-e",
+ 'tell application "System Events" to keystroke "v" using command down',
+ )
+ completed = subprocess.run(paste_command, capture_output=True)
+
+ if completed.returncode != 0:
+    # The above AppleScript requires the "control your computer" permission:
+    # Settings > Privacy & Security > Accessibility
+    # If it is not enabled, fall back to running the command directly.
+ debugger.HandleCommand(command)
+
+
+def _load_history(history_file):
+ """Load, decode, parse, and prepare an lldb history file for fzf."""
+ with open(history_file) as f:
+ history_contents = f.read()
+
+ history_decoded = re.sub(r"\\0([0-7][0-7])", _decode_char, history_contents)
+ history_lines = history_decoded.splitlines()
+
+ # Skip the header line (_HiStOrY_V2_)
+ del history_lines[0]
+ # Reverse to show latest first.
+ history_lines.reverse()
+
+ history_commands = []
+ history_seen = set()
+ for line in history_lines:
+ line = line.strip()
+ # Skip empty lines, single character commands, and duplicates.
+ if line and len(line) > 1 and line not in history_seen:
+ history_commands.append(line)
+ history_seen.add(line)
+
+ return "\n".join(history_commands)
+
+
+def _decode_char(match):
+ """Decode octal strings ('\0NN') into a single character string."""
+ code = int(match.group(1), base=8)
+ return chr(code)
diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h
index 29b87de..909ee08 100644
--- a/lldb/include/lldb/Core/ModuleList.h
+++ b/lldb/include/lldb/Core/ModuleList.h
@@ -326,11 +326,11 @@ public:
void FindGlobalVariables(const RegularExpression &regex, size_t max_matches,
VariableList &variable_list) const;
- /// Finds the first module whose file specification matches \a file_spec.
+ /// Finds modules whose file specification matches \a module_spec.
///
/// \param[in] module_spec
/// A file specification object to match against the Module's
- /// file specifications. If \a file_spec does not have
+ /// file specifications. If \a module_spec does not have
/// directory information, matches will occur by matching only
/// the basename of any modules in this list. If this value is
/// NULL, then file specifications won't be compared when
@@ -351,6 +351,7 @@ public:
// UUID values is very efficient and accurate.
lldb::ModuleSP FindModule(const UUID &uuid) const;
+ /// Finds the first module whose file specification matches \a module_spec.
lldb::ModuleSP FindFirstModule(const ModuleSpec &module_spec) const;
void FindSymbolsWithNameAndType(ConstString name,
diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h
index 48c9bef..de120c4 100644
--- a/lldb/include/lldb/Symbol/UnwindPlan.h
+++ b/lldb/include/lldb/Symbol/UnwindPlan.h
@@ -175,13 +175,13 @@ public:
void SetIsDWARFExpression(const uint8_t *opcodes, uint32_t len);
- const uint8_t *GetDWARFExpressionBytes() {
+ const uint8_t *GetDWARFExpressionBytes() const {
if (m_type == atDWARFExpression || m_type == isDWARFExpression)
return m_location.expr.opcodes;
return nullptr;
}
- int GetDWARFExpressionLength() {
+ int GetDWARFExpressionLength() const {
if (m_type == atDWARFExpression || m_type == isDWARFExpression)
return m_location.expr.length;
return 0;
@@ -308,13 +308,13 @@ public:
}
}
- const uint8_t *GetDWARFExpressionBytes() {
+ const uint8_t *GetDWARFExpressionBytes() const {
if (m_type == isDWARFExpression)
return m_value.expr.opcodes;
return nullptr;
}
- int GetDWARFExpressionLength() {
+ int GetDWARFExpressionLength() const {
if (m_type == isDWARFExpression)
return m_value.expr.length;
return 0;
@@ -362,8 +362,10 @@ public:
void SlideOffset(lldb::addr_t offset) { m_offset += offset; }
+ const FAValue &GetCFAValue() const { return m_cfa_value; }
FAValue &GetCFAValue() { return m_cfa_value; }
+ const FAValue &GetAFAValue() const { return m_afa_value; }
FAValue &GetAFAValue() { return m_afa_value; }
bool SetRegisterLocationToAtCFAPlusOffset(uint32_t reg_num, int32_t offset,
@@ -464,7 +466,7 @@ public:
// unknown - the final row in the UnwindPlan is returned. In practice, the
// UnwindPlan for a function with no known start address will be the
// architectural default UnwindPlan which will only have one row.
- UnwindPlan::RowSP GetRowForFunctionOffset(int offset) const;
+ const UnwindPlan::Row *GetRowForFunctionOffset(int offset) const;
lldb::RegisterKind GetRegisterKind() const { return m_register_kind; }
@@ -495,9 +497,9 @@ public:
bool IsValidRowIndex(uint32_t idx) const;
- const UnwindPlan::RowSP GetRowAtIndex(uint32_t idx) const;
+ const UnwindPlan::Row *GetRowAtIndex(uint32_t idx) const;
- const UnwindPlan::RowSP GetLastRow() const;
+ const UnwindPlan::Row *GetLastRow() const;
lldb_private::ConstString GetSourceName() const;
diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h
index 3be9eb5..6cd918f 100644
--- a/lldb/include/lldb/Target/RegisterContextUnwind.h
+++ b/lldb/include/lldb/Target/RegisterContextUnwind.h
@@ -191,7 +191,8 @@ private:
// Get the Frame Address register for a given frame.
bool ReadFrameAddress(lldb::RegisterKind register_kind,
- UnwindPlan::Row::FAValue &fa, lldb::addr_t &address);
+ const UnwindPlan::Row::FAValue &fa,
+ lldb::addr_t &address);
lldb::UnwindPlanSP GetFastUnwindPlanForFrame();
diff --git a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
index 0feb927..0b8862f 100644
--- a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
+++ b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
@@ -946,8 +946,10 @@ uint32_t SymbolFileCTF::ResolveSymbolContext(const Address &so_addr,
// Resolve functions.
if (resolve_scope & eSymbolContextFunction) {
for (FunctionSP function_sp : m_functions) {
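+        // A function may cover several discontiguous address ranges; resolve
+        // the context if any of them contains the file address.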
- if (function_sp->GetAddressRange().ContainsFileAddress(
- so_addr.GetFileAddress())) {
+ if (llvm::any_of(
+ function_sp->GetAddressRanges(), [&](const AddressRange range) {
+ return range.ContainsFileAddress(so_addr.GetFileAddress());
+ })) {
sc.function = function_sp.get();
resolved_flags |= eSymbolContextFunction;
break;
diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
index 8a26d76..5022310 100644
--- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
@@ -116,11 +116,9 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly(
// Make a copy of the current instruction Row and save it in m_curr_row
// so we can add updates as we process the instructions.
- UnwindPlan::RowSP last_row = unwind_plan.GetLastRow();
- UnwindPlan::Row *newrow = new UnwindPlan::Row;
- if (last_row.get())
- *newrow = *last_row.get();
- m_curr_row.reset(newrow);
+ UnwindPlan::RowSP last_row =
+ std::make_shared<UnwindPlan::Row>(*unwind_plan.GetLastRow());
+ m_curr_row = std::make_shared<UnwindPlan::Row>(*last_row);
// Add the initial state to the save list with offset 0.
saved_unwind_states.insert({0, {last_row, m_register_values}});
diff --git a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp
index 5c846ba..89a20ee 100644
--- a/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/x86/UnwindAssembly-x86.cpp
@@ -68,8 +68,8 @@ bool UnwindAssembly_x86::AugmentUnwindPlanFromCallSite(
AddressRange &func, Thread &thread, UnwindPlan &unwind_plan) {
bool do_augment_unwindplan = true;
- UnwindPlan::RowSP first_row = unwind_plan.GetRowForFunctionOffset(0);
- UnwindPlan::RowSP last_row = unwind_plan.GetRowForFunctionOffset(-1);
+ const UnwindPlan::Row *first_row = unwind_plan.GetRowForFunctionOffset(0);
+ const UnwindPlan::Row *last_row = unwind_plan.GetLastRow();
int wordsize = 8;
ProcessSP process_sp(thread.GetProcess());
diff --git a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
index 45d2f95..84f37eb 100644
--- a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp
@@ -1359,7 +1359,7 @@ bool x86AssemblyInspectionEngine::AugmentUnwindPlanFromCallSite(
if (unwind_plan.GetRowCount() < 2)
return false;
- UnwindPlan::RowSP first_row = unwind_plan.GetRowAtIndex(0);
+ const UnwindPlan::Row *first_row = unwind_plan.GetRowAtIndex(0);
if (first_row->GetOffset() != 0)
return false;
uint32_t cfa_reg = first_row->GetCFAValue().GetRegisterNumber();
@@ -1372,7 +1372,7 @@ bool x86AssemblyInspectionEngine::AugmentUnwindPlanFromCallSite(
first_row->GetCFAValue().GetOffset() != m_wordsize)
return false;
- UnwindPlan::RowSP original_last_row = unwind_plan.GetRowForFunctionOffset(-1);
+ const UnwindPlan::Row *original_last_row = unwind_plan.GetLastRow();
size_t offset = 0;
int row_id = 1;
@@ -1417,7 +1417,7 @@ bool x86AssemblyInspectionEngine::AugmentUnwindPlanFromCallSite(
unwind_plan.GetRowAtIndex(row_id)->GetOffset() <= offset) {
row_id++;
}
- UnwindPlan::RowSP original_row = unwind_plan.GetRowAtIndex(row_id - 1);
+ const UnwindPlan::Row *original_row = unwind_plan.GetRowAtIndex(row_id - 1);
if (original_row->GetOffset() == offset) {
*row = *original_row;
continue;
diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp
index d01a899..0695f0d 100644
--- a/lldb/source/Symbol/FuncUnwinders.cpp
+++ b/lldb/source/Symbol/FuncUnwinders.cpp
@@ -366,11 +366,11 @@ LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation(
RegisterNumber pc_reg(thread, eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC);
uint32_t pc_reg_lldb_regnum = pc_reg.GetAsKind(eRegisterKindLLDB);
- if (a.get() && b.get()) {
- UnwindPlan::RowSP a_first_row = a->GetRowAtIndex(0);
- UnwindPlan::RowSP b_first_row = b->GetRowAtIndex(0);
+ if (a && b) {
+ const UnwindPlan::Row *a_first_row = a->GetRowAtIndex(0);
+ const UnwindPlan::Row *b_first_row = b->GetRowAtIndex(0);
- if (a_first_row.get() && b_first_row.get()) {
+ if (a_first_row && b_first_row) {
UnwindPlan::Row::AbstractRegisterLocation a_pc_regloc;
UnwindPlan::Row::AbstractRegisterLocation b_pc_regloc;
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index 25e3676..a3df927 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -412,48 +412,44 @@ void UnwindPlan::InsertRow(const UnwindPlan::RowSP &row_sp,
*it = row_sp;
}
-UnwindPlan::RowSP UnwindPlan::GetRowForFunctionOffset(int offset) const {
+const UnwindPlan::Row *UnwindPlan::GetRowForFunctionOffset(int offset) const {
+ if (m_row_list.empty())
+ return nullptr;
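+  // Rows are kept sorted by ascending offset; an offset of -1 is a sentinel
+  // meaning "the last row in the plan".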
+ if (offset == -1)
+ return m_row_list.back().get();
+
RowSP row;
- if (!m_row_list.empty()) {
- if (offset == -1)
- row = m_row_list.back();
- else {
- collection::const_iterator pos, end = m_row_list.end();
- for (pos = m_row_list.begin(); pos != end; ++pos) {
- if ((*pos)->GetOffset() <= static_cast<lldb::offset_t>(offset))
- row = *pos;
- else
- break;
- }
- }
+ collection::const_iterator pos, end = m_row_list.end();
+ for (pos = m_row_list.begin(); pos != end; ++pos) {
+ if ((*pos)->GetOffset() <= static_cast<lldb::offset_t>(offset))
+ row = *pos;
+ else
+ break;
}
- return row;
+ return row.get();
}
bool UnwindPlan::IsValidRowIndex(uint32_t idx) const {
return idx < m_row_list.size();
}
-const UnwindPlan::RowSP UnwindPlan::GetRowAtIndex(uint32_t idx) const {
+const UnwindPlan::Row *UnwindPlan::GetRowAtIndex(uint32_t idx) const {
if (idx < m_row_list.size())
- return m_row_list[idx];
- else {
- Log *log = GetLog(LLDBLog::Unwind);
- LLDB_LOGF(log,
- "error: UnwindPlan::GetRowAtIndex(idx = %u) invalid index "
- "(number rows is %u)",
- idx, (uint32_t)m_row_list.size());
- return UnwindPlan::RowSP();
- }
+ return m_row_list[idx].get();
+ LLDB_LOG(GetLog(LLDBLog::Unwind),
+ "error: UnwindPlan::GetRowAtIndex(idx = {0}) invalid index "
+ "(number rows is {1})",
+ idx, m_row_list.size());
+ return nullptr;
}
-const UnwindPlan::RowSP UnwindPlan::GetLastRow() const {
+const UnwindPlan::Row *UnwindPlan::GetLastRow() const {
if (m_row_list.empty()) {
Log *log = GetLog(LLDBLog::Unwind);
LLDB_LOGF(log, "UnwindPlan::GetLastRow() when rows are empty");
- return UnwindPlan::RowSP();
+ return nullptr;
}
- return m_row_list.back();
+ return m_row_list.back().get();
}
int UnwindPlan::GetRowCount() const { return m_row_list.size(); }
@@ -486,7 +482,7 @@ bool UnwindPlan::PlanValidAtAddress(Address addr) {
// If the 0th Row of unwind instructions is missing, or if it doesn't provide
// a register to use to find the Canonical Frame Address, this is not a valid
// UnwindPlan.
- if (GetRowAtIndex(0).get() == nullptr ||
+ if (GetRowAtIndex(0) == nullptr ||
GetRowAtIndex(0)->GetCFAValue().GetValueType() ==
Row::FAValue::unspecified) {
Log *log = GetLog(LLDBLog::Unwind);
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index dbe885e..b970cb7 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -207,7 +207,7 @@ void RegisterContextUnwind::InitializeZerothFrame() {
m_fast_unwind_plan_sp = GetFastUnwindPlanForFrame();
m_full_unwind_plan_sp = GetFullUnwindPlanForFrame();
- UnwindPlan::RowSP active_row;
+ const UnwindPlan::Row *active_row;
lldb::RegisterKind row_register_kind = eRegisterKindGeneric;
// If we have LanguageRuntime UnwindPlan for this unwind, use those
@@ -246,7 +246,7 @@ void RegisterContextUnwind::InitializeZerothFrame() {
active_row =
m_full_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
row_register_kind = m_full_unwind_plan_sp->GetRegisterKind();
- if (active_row.get() && log) {
+ if (active_row && log) {
StreamString active_row_strm;
active_row->Dump(active_row_strm, m_full_unwind_plan_sp.get(), &m_thread,
m_start_pc.GetLoadAddress(exe_ctx.GetTargetPtr()));
@@ -254,7 +254,7 @@ void RegisterContextUnwind::InitializeZerothFrame() {
}
}
- if (!active_row.get()) {
+ if (!active_row) {
UnwindLogMsg("could not find an unwindplan row for this frame's pc");
m_frame_type = eNotAValidFrame;
return;
@@ -442,8 +442,8 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
m_current_offset = -1;
m_current_offset_backed_up_one = -1;
RegisterKind row_register_kind = m_full_unwind_plan_sp->GetRegisterKind();
- UnwindPlan::RowSP row = m_full_unwind_plan_sp->GetRowForFunctionOffset(0);
- if (row.get()) {
+ if (const UnwindPlan::Row *row =
+ m_full_unwind_plan_sp->GetRowForFunctionOffset(0)) {
if (!ReadFrameAddress(row_register_kind, row->GetCFAValue(), m_cfa)) {
UnwindLogMsg("failed to get cfa value");
if (m_frame_type != eSkipFrame) // don't override eSkipFrame
@@ -593,7 +593,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
}
}
- UnwindPlan::RowSP active_row;
+ const UnwindPlan::Row *active_row;
RegisterKind row_register_kind = eRegisterKindGeneric;
// If we have LanguageRuntime UnwindPlan for this unwind, use those
@@ -640,7 +640,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
m_fast_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
row_register_kind = m_fast_unwind_plan_sp->GetRegisterKind();
PropagateTrapHandlerFlagFromUnwindPlan(m_fast_unwind_plan_sp);
- if (active_row.get() && log) {
+ if (active_row && log) {
StreamString active_row_strm;
active_row->Dump(active_row_strm, m_fast_unwind_plan_sp.get(), &m_thread,
m_start_pc.GetLoadAddress(exe_ctx.GetTargetPtr()));
@@ -655,7 +655,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
m_current_offset_backed_up_one);
row_register_kind = m_full_unwind_plan_sp->GetRegisterKind();
PropagateTrapHandlerFlagFromUnwindPlan(m_full_unwind_plan_sp);
- if (active_row.get() && log) {
+ if (active_row && log) {
StreamString active_row_strm;
active_row->Dump(active_row_strm, m_full_unwind_plan_sp.get(),
&m_thread,
@@ -667,7 +667,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
}
}
- if (!active_row.get()) {
+ if (!active_row) {
m_frame_type = eNotAValidFrame;
UnwindLogMsg("could not find unwind row for this pc");
return;
@@ -1285,7 +1285,7 @@ RegisterContextUnwind::SavedLocationForRegister(
RegisterKind unwindplan_registerkind = kNumRegisterKinds;
if (m_fast_unwind_plan_sp) {
- UnwindPlan::RowSP active_row =
+ const UnwindPlan::Row *active_row =
m_fast_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
unwindplan_registerkind = m_fast_unwind_plan_sp->GetRegisterKind();
if (regnum.GetAsKind(unwindplan_registerkind) == LLDB_INVALID_REGNUM) {
@@ -1326,12 +1326,12 @@ RegisterContextUnwind::SavedLocationForRegister(
RegisterNumber pc_regnum(m_thread, eRegisterKindGeneric,
LLDB_REGNUM_GENERIC_PC);
- UnwindPlan::RowSP active_row =
+ const UnwindPlan::Row *active_row =
m_full_unwind_plan_sp->GetRowForFunctionOffset(
m_current_offset_backed_up_one);
unwindplan_registerkind = m_full_unwind_plan_sp->GetRegisterKind();
- if (got_new_full_unwindplan && active_row.get() && log) {
+ if (got_new_full_unwindplan && active_row && log) {
StreamString active_row_strm;
ExecutionContext exe_ctx(m_thread.shared_from_this());
active_row->Dump(active_row_strm, m_full_unwind_plan_sp.get(),
@@ -1455,7 +1455,7 @@ RegisterContextUnwind::SavedLocationForRegister(
if (ForceSwitchToFallbackUnwindPlan()) {
// Update for the possibly new unwind plan
unwindplan_registerkind = m_full_unwind_plan_sp->GetRegisterKind();
- UnwindPlan::RowSP active_row =
+ const UnwindPlan::Row *active_row =
m_full_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
// Sanity check: Verify that we can fetch a pc value and CFA value
@@ -1799,7 +1799,7 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() {
m_full_unwind_plan_sp = m_fallback_unwind_plan_sp;
- UnwindPlan::RowSP active_row =
+ const UnwindPlan::Row *active_row =
m_fallback_unwind_plan_sp->GetRowForFunctionOffset(
m_current_offset_backed_up_one);
@@ -1885,7 +1885,7 @@ bool RegisterContextUnwind::ForceSwitchToFallbackUnwindPlan() {
return false;
}
- UnwindPlan::RowSP active_row =
+ const UnwindPlan::Row *active_row =
m_fallback_unwind_plan_sp->GetRowForFunctionOffset(m_current_offset);
if (active_row &&
@@ -1967,7 +1967,7 @@ void RegisterContextUnwind::PropagateTrapHandlerFlagFromUnwindPlan(
}
bool RegisterContextUnwind::ReadFrameAddress(
- lldb::RegisterKind row_register_kind, UnwindPlan::Row::FAValue &fa,
+ lldb::RegisterKind row_register_kind, const UnwindPlan::Row::FAValue &fa,
addr_t &address) {
RegisterValue reg_value;
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 4d06863..f8061ff 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1670,13 +1670,14 @@ lldb::ValueObjectSP DoGuessValueAt(StackFrame &frame, ConstString reg,
break;
case Instruction::Operand::Type::Immediate: {
SymbolContext sc;
- Address load_address;
- if (!frame.CalculateTarget()->ResolveLoadAddress(
- operands[0].m_immediate, load_address)) {
+ if (!pc.GetModule())
+ break;
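+      // Treat the immediate as a file address within the PC's module rather
+      // than resolving it as a load address in the target.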
+ Address address(operands[0].m_immediate,
+ pc.GetModule()->GetSectionList());
+ if (!address.IsValid())
break;
- }
frame.CalculateTarget()->GetImages().ResolveSymbolContextForAddress(
- load_address, eSymbolContextFunction, sc);
+ address, eSymbolContextFunction, sc);
if (!sc.function) {
break;
}
diff --git a/lldb/source/Target/ThreadPlanStepRange.cpp b/lldb/source/Target/ThreadPlanStepRange.cpp
index de4cd59..78e1270 100644
--- a/lldb/source/Target/ThreadPlanStepRange.cpp
+++ b/lldb/source/Target/ThreadPlanStepRange.cpp
@@ -197,9 +197,11 @@ bool ThreadPlanStepRange::InRange() {
bool ThreadPlanStepRange::InSymbol() {
lldb::addr_t cur_pc = GetThread().GetRegisterContext()->GetPC();
if (m_addr_context.function != nullptr) {
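+    // A function can span multiple address ranges; we only need to know that
+    // some range contains the PC, so the matched range itself is discarded.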
- return m_addr_context.function->GetAddressRange().ContainsLoadAddress(
- cur_pc, &GetTarget());
- } else if (m_addr_context.symbol && m_addr_context.symbol->ValueIsAddress()) {
+ AddressRange unused_range;
+ return m_addr_context.function->GetRangeContainingLoadAddress(
+ cur_pc, GetTarget(), unused_range);
+ }
+ if (m_addr_context.symbol && m_addr_context.symbol->ValueIsAddress()) {
AddressRange range(m_addr_context.symbol->GetAddressRef(),
m_addr_context.symbol->GetByteSize());
return range.ContainsLoadAddress(cur_pc, &GetTarget());
diff --git a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py
index d0f6ebe..f7b5966 100644
--- a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py
+++ b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py
@@ -10,7 +10,7 @@ from lldbsuite.test import lldbutil
class TestDiagnoseDereferenceFunctionReturn(TestBase):
- @expectedFailureAll(oslist=no_match(lldbplatformutil.getDarwinOSTriples()))
+ @expectedFailureAll(oslist=["windows"])
@skipIf(
archs=no_match(["x86_64"])
) # <rdar://problem/33842388> frame diagnose doesn't work for armv7 or arm64
@@ -19,9 +19,6 @@ class TestDiagnoseDereferenceFunctionReturn(TestBase):
TestBase.setUp(self)
self.build()
exe = self.getBuildArtifact("a.out")
- # FIXME: This default changed in lldbtest.py and this test
- # seems to rely on having it turned off.
- self.runCmd("settings set target.disable-aslr true")
self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
self.runCmd("run", RUN_SUCCEEDED)
self.expect("thread list", "Thread should be stopped", substrs=["stopped"])
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index 804dd8e..8b3c520 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -37,6 +37,7 @@ add_lldb_tool(lldb-dap
SourceBreakpoint.cpp
Watchpoint.cpp
+ Handler/ResponseHandler.cpp
Handler/AttachRequestHandler.cpp
Handler/BreakpointLocationsHandler.cpp
Handler/CompileUnitsRequestHandler.cpp
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 01f294e..37bc1f6 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "DAP.h"
+#include "Handler/ResponseHandler.h"
#include "JSONUtils.h"
#include "LLDBUtils.h"
#include "OutputRedirector.h"
@@ -34,6 +35,7 @@
#include <cstdarg>
#include <cstdio>
#include <fstream>
+#include <memory>
#include <mutex>
#include <utility>
@@ -769,10 +771,8 @@ bool DAP::HandleObject(const llvm::json::Object &object) {
if (packet_type == "response") {
auto id = GetSigned(object, "request_seq", 0);
- ResponseCallback response_handler = [](llvm::Expected<llvm::json::Value>) {
- llvm::errs() << "Unhandled response\n";
- };
+ std::unique_ptr<ResponseHandler> response_handler;
{
std::lock_guard<std::mutex> locker(call_mutex);
auto inflight = inflight_reverse_requests.find(id);
@@ -782,19 +782,22 @@ bool DAP::HandleObject(const llvm::json::Object &object) {
}
}
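+    // If no in-flight reverse request matches this sequence id, fall back to
+    // a handler that just logs the unexpected response.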
+ if (!response_handler)
+ response_handler = std::make_unique<UnknownResponseHandler>("", id);
+
// Result should be given, use null if not.
if (GetBoolean(object, "success", false)) {
llvm::json::Value Result = nullptr;
if (auto *B = object.get("body")) {
Result = std::move(*B);
}
- response_handler(Result);
+ (*response_handler)(Result);
} else {
llvm::StringRef message = GetString(object, "message");
if (message.empty()) {
message = "Unknown error, response failed";
}
- response_handler(llvm::createStringError(
+ (*response_handler)(llvm::createStringError(
std::error_code(-1, std::generic_category()), message));
}
@@ -875,24 +878,6 @@ llvm::Error DAP::Loop() {
return llvm::Error::success();
}
-void DAP::SendReverseRequest(llvm::StringRef command,
- llvm::json::Value arguments,
- ResponseCallback callback) {
- int64_t id;
- {
- std::lock_guard<std::mutex> locker(call_mutex);
- id = ++reverse_request_seq;
- inflight_reverse_requests.emplace(id, std::move(callback));
- }
-
- SendJSON(llvm::json::Object{
- {"type", "request"},
- {"seq", id},
- {"command", command},
- {"arguments", std::move(arguments)},
- });
-}
-
lldb::SBError DAP::WaitForProcessToStop(uint32_t seconds) {
lldb::SBError error;
lldb::SBProcess process = target.GetProcess();
@@ -1007,17 +992,10 @@ bool StartDebuggingRequestHandler::DoExecute(
return false;
}
- dap.SendReverseRequest(
+ dap.SendReverseRequest<LogFailureResponseHandler>(
"startDebugging",
llvm::json::Object{{"request", request},
- {"configuration", std::move(*configuration)}},
- [](llvm::Expected<llvm::json::Value> value) {
- if (!value) {
- llvm::Error err = value.takeError();
- llvm::errs() << "reverse start debugging request failed: "
- << llvm::toString(std::move(err)) << "\n";
- }
- });
+ {"configuration", std::move(*configuration)}});
result.SetStatus(lldb::eReturnStatusSuccessFinishNoResult);
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index f87841a..7ceb1d1 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -13,6 +13,7 @@
#include "ExceptionBreakpoint.h"
#include "FunctionBreakpoint.h"
#include "Handler/RequestHandler.h"
+#include "Handler/ResponseHandler.h"
#include "IOStream.h"
#include "InstructionBreakpoint.h"
#include "OutputRedirector.h"
@@ -68,8 +69,6 @@ enum DAPBroadcasterBits {
eBroadcastBitStopProgressThread = 1u << 1
};
-typedef void (*ResponseCallback)(llvm::Expected<llvm::json::Value> value);
-
enum class PacketStatus {
Success = 0,
EndOfFile,
@@ -197,7 +196,7 @@ struct DAP {
llvm::DenseSet<lldb::tid_t> thread_ids;
uint32_t reverse_request_seq;
std::mutex call_mutex;
- std::map<int /* request_seq */, ResponseCallback /* reply handler */>
+ llvm::SmallDenseMap<int64_t, std::unique_ptr<ResponseHandler>>
inflight_reverse_requests;
ReplMode repl_mode;
std::string command_escape_prefix = "`";
@@ -327,12 +326,24 @@ struct DAP {
/// The reverse request command.
///
/// \param[in] arguments
- /// The reverse request arguements.
- ///
- /// \param[in] callback
- /// A callback to execute when the response arrives.
- void SendReverseRequest(llvm::StringRef command, llvm::json::Value arguments,
- ResponseCallback callback);
+ /// The reverse request arguments.
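+  ///
+  /// The Handler template parameter is the ResponseHandler subclass that is
+  /// invoked with the reply, e.g.
+  /// SendReverseRequest<LogFailureResponseHandler>("runInTerminal", args).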
+ template <typename Handler>
+ void SendReverseRequest(llvm::StringRef command,
+ llvm::json::Value arguments) {
+ int64_t id;
+ {
+ std::lock_guard<std::mutex> locker(call_mutex);
+ id = ++reverse_request_seq;
+ inflight_reverse_requests[id] = std::make_unique<Handler>(command, id);
+ }
+
+ SendJSON(llvm::json::Object{
+ {"type", "request"},
+ {"seq", id},
+ {"command", command},
+ {"arguments", std::move(arguments)},
+ });
+ }
/// Registers a request handler.
template <typename Handler> void RegisterRequest() {
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
index ad00e43..0a32e39 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp
@@ -101,15 +101,8 @@ static llvm::Error RunInTerminal(DAP &dap,
#endif
llvm::json::Object reverse_request = CreateRunInTerminalReverseRequest(
launch_request, dap.debug_adaptor_path, comm_file.m_path, debugger_pid);
- dap.SendReverseRequest("runInTerminal", std::move(reverse_request),
- [](llvm::Expected<llvm::json::Value> value) {
- if (!value) {
- llvm::Error err = value.takeError();
- llvm::errs()
- << "runInTerminal request failed: "
- << llvm::toString(std::move(err)) << "\n";
- }
- });
+ dap.SendReverseRequest<LogFailureResponseHandler>("runInTerminal",
+ std::move(reverse_request));
if (llvm::Expected<lldb::pid_t> pid = comm_channel.GetLauncherPid())
attach_info.SetProcessID(*pid);
diff --git a/lldb/tools/lldb-dap/Handler/ResponseHandler.cpp b/lldb/tools/lldb-dap/Handler/ResponseHandler.cpp
new file mode 100644
index 0000000..27a1437
--- /dev/null
+++ b/lldb/tools/lldb-dap/Handler/ResponseHandler.cpp
@@ -0,0 +1,35 @@
+//===-- ResponseHandler.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ResponseHandler.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace lldb_dap {
+
+void UnknownResponseHandler::operator()(
+ llvm::Expected<llvm::json::Value> value) const {
+ llvm::errs() << "unexpected response: ";
+ if (value) {
+ if (std::optional<llvm::StringRef> str = value->getAsString())
+ llvm::errs() << *str;
+ } else {
+ llvm::errs() << "error: " << llvm::toString(value.takeError());
+ }
+ llvm::errs() << '\n';
+}
+
+void LogFailureResponseHandler::operator()(
+ llvm::Expected<llvm::json::Value> value) const {
+ if (!value)
+ llvm::errs() << "reverse request \"" << m_command << "\" (" << m_id
+ << ") failed: " << llvm::toString(value.takeError()) << '\n';
+}
+
+} // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Handler/ResponseHandler.h b/lldb/tools/lldb-dap/Handler/ResponseHandler.h
new file mode 100644
index 0000000..b09153a
--- /dev/null
+++ b/lldb/tools/lldb-dap/Handler/ResponseHandler.h
@@ -0,0 +1,56 @@
+//===-- ResponseHandler.h -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_HANDLER_RESPONSEHANDLER_H
+#define LLDB_TOOLS_LLDB_DAP_HANDLER_RESPONSEHANDLER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/JSON.h"
+#include <cstdint>
+
+namespace lldb_dap {
+struct DAP;
+
+/// Handler for responses to reverse requests.
+class ResponseHandler {
+public:
+ ResponseHandler(llvm::StringRef command, int64_t id)
+ : m_command(command), m_id(id) {}
+
+ /// ResponseHandlers are not copyable.
+ /// @{
+ ResponseHandler(const ResponseHandler &) = delete;
+ ResponseHandler &operator=(const ResponseHandler &) = delete;
+ /// @}
+
+ virtual ~ResponseHandler() = default;
+
+ virtual void operator()(llvm::Expected<llvm::json::Value> value) const = 0;
+
+protected:
+ llvm::StringRef m_command;
+ int64_t m_id;
+};
+
+/// Response handler used for unknown responses.
+class UnknownResponseHandler : public ResponseHandler {
+public:
+ using ResponseHandler::ResponseHandler;
+ void operator()(llvm::Expected<llvm::json::Value> value) const override;
+};
+
+/// Response handler which logs to stderr in case of a failure.
+class LogFailureResponseHandler : public ResponseHandler {
+public:
+ using ResponseHandler::ResponseHandler;
+ void operator()(llvm::Expected<llvm::json::Value> value) const override;
+};
+
+} // namespace lldb_dap
+
+#endif
diff --git a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp
index 12eb577..191cd3d 100644
--- a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp
+++ b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp
@@ -59,7 +59,7 @@ TEST_F(TestArm64InstEmulation, TestSimpleDarwinFunction) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -89,74 +89,74 @@ TEST_F(TestArm64InstEmulation, TestSimpleDarwinFunction) {
sample_range, data, sizeof(data), unwind_plan));
// CFA=sp +0 => fp= <same> lr= <same>
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
// CFA=sp+16 => fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// CFA=fp+16 => fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(8);
- EXPECT_EQ(8ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(8);
+ EXPECT_EQ(8ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// CFA=sp+16 => fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// CFA=sp +0 => fp= <same> lr= <same>
- row_sp = unwind_plan.GetRowForFunctionOffset(20);
- EXPECT_EQ(20ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(20);
+ EXPECT_EQ(20ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
}
@@ -167,7 +167,7 @@ TEST_F(TestArm64InstEmulation, TestMediumDarwinFunction) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -218,107 +218,107 @@ TEST_F(TestArm64InstEmulation, TestMediumDarwinFunction) {
sample_range, data, sizeof(data), unwind_plan));
// 0: CFA=sp +0 =>
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
// 4: CFA=sp+48 => x21=[CFA-40] x22=[CFA-48]
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_EQ(48, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_EQ(48, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x21_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x21_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-40, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x22_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x22_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-48, regloc.GetOffset());
// 8: CFA=sp+48 => x19=[CFA-24] x20=[CFA-32] x21=[CFA-40] x22=[CFA-48]
- row_sp = unwind_plan.GetRowForFunctionOffset(8);
- EXPECT_EQ(8ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_EQ(48, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(8);
+ EXPECT_EQ(8ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_EQ(48, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x19_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x19_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-24, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x20_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x20_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-32, regloc.GetOffset());
// 12: CFA=sp+48 => x19=[CFA-24] x20=[CFA-32] x21=[CFA-40] x22=[CFA-48]
// fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_EQ(48, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_EQ(48, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 16: CFA=fp+16 => x19=[CFA-24] x20=[CFA-32] x21=[CFA-40] x22=[CFA-48]
// fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
// 28: CFA=sp+48 => x19=[CFA-24] x20=[CFA-32] x21=[CFA-40] x22=[CFA-48]
// fp=[CFA-16] lr=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(48, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(48, row->GetCFAValue().GetOffset());
// 32: CFA=sp+48 => x19=[CFA-24] x20=[CFA-32] x21=[CFA-40] x22=[CFA-48] fp=
// <same> lr= <same>
- row_sp = unwind_plan.GetRowForFunctionOffset(32);
- EXPECT_EQ(32ull, row_sp->GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(32);
+ EXPECT_EQ(32ull, row->GetOffset());
// I'd prefer if these restored registers were cleared entirely instead of set
// to IsSame...
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
// 36: CFA=sp+48 => x19= <same> x20= <same> x21=[CFA-40] x22=[CFA-48] fp=
// <same> lr= <same>
- row_sp = unwind_plan.GetRowForFunctionOffset(36);
- EXPECT_EQ(36ull, row_sp->GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(36);
+ EXPECT_EQ(36ull, row->GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x19_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x19_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x20_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x20_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
// 40: CFA=sp +0 => x19= <same> x20= <same> x21= <same> x22= <same> fp= <same>
// lr= <same>
- row_sp = unwind_plan.GetRowForFunctionOffset(40);
- EXPECT_EQ(40ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(40);
+ EXPECT_EQ(40ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x21_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x21_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x22_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x22_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
}
@@ -329,7 +329,7 @@ TEST_F(TestArm64InstEmulation, TestFramelessThreeEpilogueFunction) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -372,53 +372,53 @@ TEST_F(TestArm64InstEmulation, TestFramelessThreeEpilogueFunction) {
sample_range, data, sizeof(data), unwind_plan));
// 0: CFA=sp +0 =>
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(32);
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x19_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x20_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x21_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x22_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x23_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x24_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x25_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x26_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x27_arm64, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(gpr_x28_arm64, regloc));
-
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_fp_arm64, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(32);
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x19_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x20_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x21_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x22_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x23_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x24_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x25_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x26_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x27_arm64, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(gpr_x28_arm64, regloc));
+
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_arm64, regloc));
EXPECT_TRUE(regloc.IsSame());
- row_sp = unwind_plan.GetRowForFunctionOffset(36);
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(52);
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(56);
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(60);
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(36);
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(52);
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(56);
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(60);
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
}
TEST_F(TestArm64InstEmulation, TestRegisterSavedTwice) {
@@ -428,7 +428,7 @@ TEST_F(TestArm64InstEmulation, TestRegisterSavedTwice) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -502,23 +502,23 @@ TEST_F(TestArm64InstEmulation, TestRegisterSavedTwice) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
sample_range, data, sizeof(data), unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(36);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(36);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x20_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x20_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-32, regloc.GetOffset());
- row_sp = unwind_plan.GetRowForFunctionOffset(40);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(40);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_x20_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_x20_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-32, regloc.GetOffset());
}
@@ -530,7 +530,7 @@ TEST_F(TestArm64InstEmulation, TestRegisterDoubleSpills) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -618,79 +618,79 @@ TEST_F(TestArm64InstEmulation, TestRegisterDoubleSpills) {
// 28: CFA=fp+16 => x27=[CFA-24] x28=[CFA-32] fp=[CFA-16] lr=[CFA-8]
// d8=[CFA-40] d9=[CFA-48] d10=[CFA-56] d11=[CFA-64] d12=[CFA-72]
// d13=[CFA-80] d14=[CFA-88] d15=[CFA-96]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d15_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d15_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-96, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d14_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d14_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-88, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d13_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d13_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-80, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d12_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d12_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-72, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d11_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d11_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-64, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d10_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d10_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-56, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d9_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d9_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-48, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(fpu_d8_arm64, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(fpu_d8_arm64, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-40, regloc.GetOffset());
// 60: CFA=sp +0 =>
- row_sp = unwind_plan.GetRowForFunctionOffset(60);
- EXPECT_EQ(60ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(60);
+ EXPECT_EQ(60ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- if (row_sp->GetRegisterInfo(fpu_d8_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d8_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d9_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d9_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d10_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d10_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d11_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d11_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d12_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d12_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d13_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d13_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d14_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d14_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(fpu_d15_arm64, regloc)) {
+ if (row->GetRegisterInfo(fpu_d15_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(gpr_x27_arm64, regloc)) {
+ if (row->GetRegisterInfo(gpr_x27_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
- if (row_sp->GetRegisterInfo(gpr_x28_arm64, regloc)) {
+ if (row->GetRegisterInfo(gpr_x28_arm64, regloc)) {
EXPECT_TRUE(regloc.IsSame());
}
}
@@ -702,7 +702,7 @@ TEST_F(TestArm64InstEmulation, TestCFARegisterTrackedAcrossJumps) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -764,34 +764,34 @@ TEST_F(TestArm64InstEmulation, TestCFARegisterTrackedAcrossJumps) {
sample_range, data, sizeof(data), unwind_plan));
// Confirm CFA at mid-func epilogue 'ret' is $sp+0
- row_sp = unwind_plan.GetRowForFunctionOffset(40);
- EXPECT_EQ(40ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(40);
+ EXPECT_EQ(40ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
// After the 'ret', confirm we're back to the correct CFA of $fp+16
- row_sp = unwind_plan.GetRowForFunctionOffset(44);
- EXPECT_EQ(44ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(44);
+ EXPECT_EQ(44ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
// Confirm that we have no additional UnwindPlan rows before the
// real epilogue -- we still get the Row at offset 44.
- row_sp = unwind_plan.GetRowForFunctionOffset(60);
- EXPECT_EQ(44ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(60);
+ EXPECT_EQ(44ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
// And in the epilogue, confirm that we start by switching back to
// defining the CFA in terms of $sp.
- row_sp = unwind_plan.GetRowForFunctionOffset(64);
- EXPECT_EQ(64ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(32, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(64);
+ EXPECT_EQ(64ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(32, row->GetCFAValue().GetOffset());
}
TEST_F(TestArm64InstEmulation, TestCFAResetToSP) {
@@ -801,7 +801,7 @@ TEST_F(TestArm64InstEmulation, TestCFAResetToSP) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -844,15 +844,15 @@ TEST_F(TestArm64InstEmulation, TestCFAResetToSP) {
sample_range, data, sizeof(data), unwind_plan));
// Confirm CFA before epilogue instructions is in terms of $fp
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
// Confirm that after restoring $fp to caller's value, CFA is now in
// terms of $sp
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
}
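
Across all of these test files the change is the same: UnwindPlan::GetRowForFunctionOffset() now hands back a const UnwindPlan::Row * owned by the plan, rather than a shared RowSP. A minimal sketch of the new calling convention, using only the accessors exercised in the tests above — the helper name, header path, and null-check are illustrative assumptions, not part of the patch:

#include <cstdint>
#include "lldb/Symbol/UnwindPlan.h" // assumed header location

using namespace lldb_private;

// True if the row covering `offset` defines its CFA as `cfa_reg + cfa_off`.
static bool RowDefinesCFA(const UnwindPlan &plan, int offset,
                          uint32_t cfa_reg, int32_t cfa_off) {
  // The returned pointer is owned by `plan`; don't keep it past the plan's
  // lifetime. Whether it can be null is an assumption made by this sketch.
  const UnwindPlan::Row *row = plan.GetRowForFunctionOffset(offset);
  if (!row)
    return false;
  const auto &cfa = row->GetCFAValue();
  return cfa.IsRegisterPlusOffset() && cfa.GetRegisterNumber() == cfa_reg &&
         cfa.GetOffset() == cfa_off;
}

Each five-line EXPECT cluster above is this predicate expanded inline, with the EXPECT_EQ on row->GetOffset() additionally pinning down which row is in effect at that point in the function.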
diff --git a/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp
index a85aad7..5eb6f2d 100644
--- a/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp
+++ b/lldb/unittests/UnwindAssembly/PPC64/TestPPC64InstEmulation.cpp
@@ -58,7 +58,7 @@ TEST_F(TestPPC64InstEmulation, TestSimpleFunction) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -96,76 +96,76 @@ TEST_F(TestPPC64InstEmulation, TestSimpleFunction) {
sample_range, data, sizeof(data), unwind_plan));
// 0: CFA=sp+0
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
// 1: CFA=sp+0 => fp=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_r31_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_r31_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 2: CFA=sp+0 => fp=[CFA-8] lr=[CFA+16]
- row_sp = unwind_plan.GetRowForFunctionOffset(20);
- EXPECT_EQ(20ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(20);
+ EXPECT_EQ(20ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(16, regloc.GetOffset());
// 3: CFA=sp+112 => fp=[CFA-8] lr=[CFA+16]
- row_sp = unwind_plan.GetRowForFunctionOffset(24);
- EXPECT_EQ(24ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(112, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(24);
+ EXPECT_EQ(24ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(112, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_r31_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_r31_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(16, regloc.GetOffset());
// 4: CFA=r31+112 => fp=[CFA-8] lr=[CFA+16]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r31_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(112, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r31_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(112, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_r31_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_r31_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(16, regloc.GetOffset());
// 5: CFA=sp+0 => fp=[CFA-8] lr=[CFA+16]
- row_sp = unwind_plan.GetRowForFunctionOffset(40);
- EXPECT_EQ(40ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(40);
+ EXPECT_EQ(40ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_r31_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_r31_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(16, regloc.GetOffset());
}
@@ -177,7 +177,7 @@ TEST_F(TestPPC64InstEmulation, TestMediumFunction) {
UnwindAssemblyInstEmulation::CreateInstance(arch)));
ASSERT_NE(nullptr, engine);
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
UnwindPlan::Row::AbstractRegisterLocation regloc;
@@ -208,51 +208,51 @@ TEST_F(TestPPC64InstEmulation, TestMediumFunction) {
sample_range, data, sizeof(data), unwind_plan));
// 0: CFA=sp+0
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
// 1: CFA=sp+0 => fp=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(8);
- EXPECT_EQ(8ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(8);
+ EXPECT_EQ(8ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_r31_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_r31_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 2: CFA=sp+0 => fp=[CFA-8] lr=[CFA+16]
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(gpr_lr_ppc64le, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(gpr_lr_ppc64le, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(16, regloc.GetOffset());
// 3: CFA=r30
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r30_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(32);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r30_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r30_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(32);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r30_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
// 4: CFA=sp+0
- row_sp = unwind_plan.GetRowForFunctionOffset(36);
- EXPECT_EQ(36ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(0, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(36);
+ EXPECT_EQ(36ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_r1_ppc64le);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(0, row->GetCFAValue().GetOffset());
}
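
A note on the comment notation used throughout these tests: a line such as "// 4: CFA=r31+112 => fp=[CFA-8] lr=[CFA+16]" records the unwind row in effect at that point in the function. It reads: the canonical frame address is computed as r31 + 112, the caller's fp was saved at address CFA-8, and lr at CFA+16. Each annotation expands into the checks that follow it — GetCFAValue() verifies the CFA base register and offset, and GetRegisterInfo() plus IsAtCFAPlusOffset()/GetOffset() verify where each saved register lives relative to the CFA.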
diff --git a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
index f5bb3b6..3600360 100644
--- a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
+++ b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
@@ -171,46 +171,46 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimple64bitFrameFunction) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
// 0: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 1: CFA=rsp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 4: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 7: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
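
Taken together, the four rows trace the usual x86-64 frame lifecycle, consistent with a push %rbp / mov %rsp,%rbp prologue: at entry only the return address is on the stack (CFA=rsp+8), the push deepens it to rsp+16, the mov rebases the CFA onto rbp+16 for the function body, and by offset 7 the epilogue has restored CFA=rsp+8 with the return address still at [CFA-8].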
@@ -247,46 +247,46 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimple32bitFrameFunction) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
// offset 0 -- pushl %ebp
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_TRUE(regloc.GetOffset() == -4);
// 1: CFA=esp +8 => ebp=[CFA-8] esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-4, regloc.GetOffset());
// 3: CFA=ebp +8 => ebp=[CFA-8] esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_ebp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(3);
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_ebp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-4, regloc.GetOffset());
// 6: CFA=esp +4 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(6);
- EXPECT_EQ(6ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(6);
+ EXPECT_EQ(6ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-4, regloc.GetOffset());
}
@@ -387,52 +387,52 @@ TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessBigStackFrame) {
// 17: CFA=rsp+14496 => rbx=[CFA-56] rbp=[CFA-16] rsp=CFA+0 r12=[CFA-48]
// r13=[CFA-40] r14=[CFA-32] r15=[CFA-24] rip=[CFA-8]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(17);
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(17);
- EXPECT_EQ(17ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(14496, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(17ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(14496, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r15, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r15, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-24, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r14, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r14, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-32, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r13, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r13, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-40, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r12, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r12, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-48, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-56, regloc.GetOffset());
// grab the Row for when the epilogue has finished executing:
// 34: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(34);
+ row = unwind_plan.GetRowForFunctionOffset(34);
- EXPECT_EQ(34ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(34ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
@@ -440,21 +440,21 @@ TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessBigStackFrame) {
// register value is the same as the caller's -- but I'd rather
// they not be mentioned at all.
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rax, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rcx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rdx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rsi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rdi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r8, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r9, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r10, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r11, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r12, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r13, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r14, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r15, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rax, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rcx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rdx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rsi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rdi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r8, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r9, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r10, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r11, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r12, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r13, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r14, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r15, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessBigStackFrame) {
@@ -651,49 +651,49 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessBigStackFrame) {
// esp=CFA+0 eip=[CFA-4]
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
// Check that we get the CFA correct for the pic base setup sequence
// CFA=esp+14464 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(10);
- EXPECT_EQ(10ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(14464, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(10);
+ EXPECT_EQ(10ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(14464, row->GetCFAValue().GetOffset());
// 15: CFA=esp+14468 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(15);
- EXPECT_EQ(15ull, row_sp->GetOffset());
- EXPECT_EQ(14468, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(15);
+ EXPECT_EQ(15ull, row->GetOffset());
+ EXPECT_EQ(14468, row->GetCFAValue().GetOffset());
// 16: CFA=esp+14464 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_EQ(14464, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_EQ(14464, row->GetCFAValue().GetOffset());
// Check that the row for offset 16 has the registers saved that we expect
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-4, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-12, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_edi, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_edi, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_esi, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_esi, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-20, regloc.GetOffset());
@@ -702,56 +702,56 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessBigStackFrame) {
// 23: CFA=esp+14472 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(23);
- EXPECT_EQ(23ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(14472, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(23);
+ EXPECT_EQ(23ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(14472, row->GetCFAValue().GetOffset());
// 24: CFA=esp+14476 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(24);
- EXPECT_EQ(24ull, row_sp->GetOffset());
- EXPECT_EQ(14476, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(24);
+ EXPECT_EQ(24ull, row->GetOffset());
+ EXPECT_EQ(14476, row->GetCFAValue().GetOffset());
// 28: CFA=esp+14480 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_EQ(14480, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_EQ(14480, row->GetCFAValue().GetOffset());
// 36: CFA=esp+14464 => ebx=[CFA-12] edi=[CFA-16] esi=[CFA-20] ebp=[CFA-8]
// esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(36);
- EXPECT_EQ(36ull, row_sp->GetOffset());
- EXPECT_EQ(14464, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(36);
+ EXPECT_EQ(36ull, row->GetOffset());
+ EXPECT_EQ(14464, row->GetCFAValue().GetOffset());
// Check that the epilogue gets us back to the original unwind state
// 47: CFA=esp +4 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(47);
- EXPECT_EQ(47ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(47);
+ EXPECT_EQ(47ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_eip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_eip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-4, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_esp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_esp, regloc));
EXPECT_TRUE(regloc.IsCFAPlusOffset());
EXPECT_EQ(0, regloc.GetOffset());
// Check that no unexpected registers were saved
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_eax, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ecx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_esi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_eax, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ecx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_edx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_esi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_edi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessSmallStackFrame) {
@@ -807,46 +807,46 @@ TEST_F(Testx86AssemblyInspectionEngine, Test64bitFramelessSmallStackFrame) {
// grab the Row for when the prologue has finished executing:
// 1: CFA=rsp+16 => rsp=CFA+0 rip=[CFA-8]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(13);
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(13);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// none of these were spilled
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rax, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rcx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rdx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rsi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rdi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r8, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r9, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r10, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r11, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r12, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r13, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r14, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r15, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rax, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rcx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rdx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rsi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rdi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r8, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r9, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r10, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r11, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r12, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r13, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r14, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_r15, regloc));
// grab the Row for when the epilogue has finished executing:
// 22: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(22);
+ row = unwind_plan.GetRowForFunctionOffset(22);
- EXPECT_EQ(22ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(22ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
@@ -916,54 +916,54 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32bitFramelessSmallStackFrame) {
// Check unwind state before we set up the picbase register
// 3: CFA=esp+16 => esp=CFA+0 eip=[CFA-4]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(3);
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
// Check unwind state after we call the next instruction
// 8: CFA=esp+20 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(8);
- EXPECT_EQ(8ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(20, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(8);
+ EXPECT_EQ(8ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(20, row->GetCFAValue().GetOffset());
// Check unwind state after we pop the pic base value off the stack
// row[3]: 9: CFA=esp+16 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(9);
- EXPECT_EQ(9ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(9);
+ EXPECT_EQ(9ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
// Check that no unexpected registers were saved
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_eax, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ecx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edx, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_esi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edi, regloc));
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_eax, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ecx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_edx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_esi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_edi, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
// verify that we get back to the original unwind state before the ret
// 34: CFA=esp +4 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(34);
- EXPECT_EQ(34ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(34);
+ EXPECT_EQ(34ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushRBP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x55, // pushq %rbp
@@ -977,14 +977,14 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
+ row = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
@@ -992,21 +992,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
+ row = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushImm) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x68, 0xff, 0xff, 0x01, 0x69, // pushq $0x6901ffff
@@ -1021,33 +1021,33 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushImm) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(5);
- EXPECT_EQ(5ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(5);
+ EXPECT_EQ(5ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(24, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(24, row->GetCFAValue().GetOffset());
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(5);
- EXPECT_EQ(5ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(12, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(5);
+ EXPECT_EQ(5ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(12, row->GetCFAValue().GetOffset());
}
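
The CFA deltas in TestPushImm are pure word-size arithmetic: the same push instructions grow the stack by 8 bytes each under the 64-bit engine (CFA offset 16 at offset 5, 24 at offset 7) and by 4 bytes each under the 32-bit engine (8 to 12), which is exactly what the paired runs assert.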
// We treat 'pushq $0' / 'pushl $0' specially - this shows up
@@ -1055,7 +1055,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushImm) {
// put a 0 as the saved pc. We pretend it didn't change the CFA.
TEST_F(Testx86AssemblyInspectionEngine, TestPush0) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x6a, 0x00, // pushq $0
@@ -1069,24 +1069,24 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPush0) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
// We're verifying that no row was created for the 'pushq $0'
- EXPECT_EQ(0ull, row_sp->GetOffset());
+ EXPECT_EQ(0ull, row->GetOffset());
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
// We're verifying that no row was created for the 'pushq $0'
- EXPECT_EQ(0ull, row_sp->GetOffset());
+ EXPECT_EQ(0ull, row->GetOffset());
}
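// Why no new row exists at offset 2: an ordinary push bumps the CFA offset by
// the word size and emits a fresh row, but for the 'push $0' pc-sentinel the
// engine keeps the previous rule, per the comment above. Illustrative
// arithmetic only:
//
//   pushq %rbp  ->  CFA = rsp + 16   (new row emitted at the next offset)
//   pushq $0    ->  CFA unchanged    (no row; the offset-0 row still answers)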
TEST_F(Testx86AssemblyInspectionEngine, TestPushExtended) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0xff, 0x74, 0x24, 0x20, // pushl 0x20(%esp)
@@ -1102,39 +1102,39 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushExtended) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
+ row = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(10);
- EXPECT_EQ(10ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(12, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(10);
+ EXPECT_EQ(10ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(12, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushR15) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x41, 0x57, // pushq %r15
@@ -1148,21 +1148,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR15) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r15, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r15, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushR14) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x41, 0x56, // pushq %r14
@@ -1176,21 +1176,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR14) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r14, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r14, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushR13) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x41, 0x55, // pushq %r13
@@ -1204,21 +1204,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR13) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r13, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r13, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushR12) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x41, 0x54, // pushq %r12
@@ -1232,21 +1232,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushR12) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r12, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r12, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushRBX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x53, // pushq %rbx
@@ -1260,14 +1260,14 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBX) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
+ row = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
@@ -1277,7 +1277,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBX) {
// not tracked (except to keep track of stack pointer movement)
TEST_F(Testx86AssemblyInspectionEngine, TestPushEAX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1292,13 +1292,13 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEAX) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_eax, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_eax, regloc));
}
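// eax, ecx, and edx are caller-saved (volatile) in the hardcoded i386 ABI, so
// no save location is ever recorded for them; only the CFA offset moves. A
// hypothetical helper (not part of this patch) expressing that shared
// expectation for this and the next two tests:
//
//   static void ExpectUntracked(const UnwindPlan::Row *row, uint32_t regnum) {
//     UnwindPlan::Row::AbstractRegisterLocation loc;
//     EXPECT_FALSE(row->GetRegisterInfo(regnum, loc)); // volatile: no rule
//   }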
// The ABI is hardcoded in x86AssemblyInspectionEngine such that
@@ -1306,7 +1306,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEAX) {
// not tracked (except to keep track of stack pointer movement)
TEST_F(Testx86AssemblyInspectionEngine, TestPushECX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1321,13 +1321,13 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushECX) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ecx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_ecx, regloc));
}
// The ABI is hardcoded in x86AssemblyInspectionEngine such that
@@ -1335,7 +1335,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushECX) {
// not tracked (except to keep track of stack pointer movement)
TEST_F(Testx86AssemblyInspectionEngine, TestPushEDX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1350,18 +1350,18 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEDX) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edx, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_edx, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushEBX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1376,20 +1376,20 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEBX) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushEBP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1404,20 +1404,20 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEBP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushRBPWithREX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data[] = {
0x40, 0x55, // pushq %rbp
@@ -1431,21 +1431,21 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushRBPWithREX) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
+ row = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-16, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushESI) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1460,20 +1460,20 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushESI) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_esi, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_esi, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPushEDI) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1488,20 +1488,20 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPushEDI) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_edi, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_edi, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
uint8_t data64_1[] = {
0x48, 0x8b, 0xec, // movq %rsp, %rbp
@@ -1515,12 +1515,12 @@ TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data64_1, sizeof(data64_1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(3);
+ row = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
uint8_t data64_2[] = {
0x48, 0x89, 0xe5, // movq %rsp, %rbp
@@ -1532,11 +1532,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data64_2, sizeof(data64_2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(3);
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
uint8_t data32_1[] = {
0x8b, 0xec, // movl %esp, %ebp
@@ -1548,11 +1548,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data32_1, sizeof(data32_1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_ebp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_ebp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
uint8_t data32_2[] = {
0x89, 0xe5, // movl %esp, %ebp
@@ -1564,16 +1564,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestMovRSPtoRBP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data32_2, sizeof(data32_2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_ebp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_ebp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
}
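// As a timeline, the 64-bit assertions above encode the frame-pointer
// handoff: once rbp is established, the CFA rule switches registers so later
// rsp arithmetic cannot invalidate it (offsets are into data64_1):
//
//   offset 0: CFA = rsp + 8    // function entry
//   offset 3: CFA = rbp + 8    // after 'movq %rsp, %rbp'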
TEST_F(Testx86AssemblyInspectionEngine, TestSubRSP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine64 = Getx86_64Inspector();
@@ -1588,11 +1588,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubRSP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data1, sizeof(data1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(264, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(264, row->GetCFAValue().GetOffset());
uint8_t data2[] = {
0x48, 0x83, 0xec, 0x10, // subq $0x10, %rsp
@@ -1604,16 +1604,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubRSP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data2, sizeof(data2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(24, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(24, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestSubESP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1628,11 +1628,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubESP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data1, sizeof(data1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(6);
- EXPECT_EQ(6ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(260, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(6);
+ EXPECT_EQ(6ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(260, row->GetCFAValue().GetOffset());
uint8_t data2[] = {
0x83, 0xec, 0x10, // subl $0x10, %esp
@@ -1644,16 +1644,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSubESP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data2, sizeof(data2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(20, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(3);
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(20, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestAddRSP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine64 = Getx86_64Inspector();
@@ -1668,11 +1668,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddRSP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data1, sizeof(data1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8 - 256, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8 - 256, row->GetCFAValue().GetOffset());
uint8_t data2[] = {
0x48, 0x83, 0xc4, 0x10, // addq $0x10, %rsp
@@ -1684,16 +1684,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddRSP) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data2, sizeof(data2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8 - 16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8 - 16, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -1708,11 +1708,11 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data1, sizeof(data1), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(6);
- EXPECT_EQ(6ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4 - 256, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(6);
+ EXPECT_EQ(6ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4 - 256, row->GetCFAValue().GetOffset());
uint8_t data2[] = {
0x83, 0xc4, 0x10, // addl $0x10, %esp
@@ -1724,16 +1724,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data2, sizeof(data2), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(3);
- EXPECT_EQ(3ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4 - 16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(3);
+ EXPECT_EQ(3ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4 - 16, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1748,16 +1748,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1773,17 +1773,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbx, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbx, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopRBP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1799,17 +1799,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopRBP) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopR12) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1825,17 +1825,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR12) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r12, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_r12, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopR13) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1851,17 +1851,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR13) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r13, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_r13, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopR14) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1877,17 +1877,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR14) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r14, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_r14, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopR15) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1903,17 +1903,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopR15) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_r15, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_r15, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopEBX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Geti386Inspector();
@@ -1929,17 +1929,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEBX) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebx, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebx, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopEBP) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Geti386Inspector();
@@ -1955,17 +1955,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEBP) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopRBPWithREX) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
@@ -1981,17 +1981,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopRBPWithREX) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopESI) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Geti386Inspector();
@@ -2007,17 +2007,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopESI) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_esi, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_esi, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestPopEDI) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Geti386Inspector();
@@ -2033,19 +2033,19 @@ TEST_F(Testx86AssemblyInspectionEngine, TestPopEDI) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_edi, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_edi, regloc));
}
// We don't track these registers, but make sure the CFA address is updated
// if we're defining the CFA in terms of esp.
TEST_F(Testx86AssemblyInspectionEngine, Testi386IgnoredRegisters) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine = Geti386Inspector();
@@ -2068,22 +2068,22 @@ TEST_F(Testx86AssemblyInspectionEngine, Testi386IgnoredRegisters) {
EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(20, row_sp->GetCFAValue().GetOffset());
-
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(20, row->GetCFAValue().GetOffset());
+
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
}
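// The instruction bytes are elided by this hunk, but the asserted offsets
// imply the shape of the sequence even for untracked registers:
//
//   entry:    CFA = esp + 4
//   offset 4: CFA = esp + 20   // +16, e.g. four 4-byte pushes of ignored regs
//   offset 7: CFA = esp + 8    // the matching pops shrink it back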
TEST_F(Testx86AssemblyInspectionEngine, TestLEAVE) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine64 = Getx86_64Inspector();
@@ -2100,22 +2100,22 @@ TEST_F(Testx86AssemblyInspectionEngine, TestLEAVE) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
}
// In i386, which lacks pc-relative addressing, a common code sequence
@@ -2124,7 +2124,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestLEAVE) {
// into a register (the "pic base" register).
TEST_F(Testx86AssemblyInspectionEngine, TestCALLNextInsn) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -2139,17 +2139,17 @@ TEST_F(Testx86AssemblyInspectionEngine, TestCALLNextInsn) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(5);
- EXPECT_EQ(5ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ row = unwind_plan.GetRowForFunctionOffset(5);
+ EXPECT_EQ(5ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
+ EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
}
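// The byte stream under test encodes the pic-base idiom described above; in
// AT&T syntax it is roughly (illustrative, reconstructed from the comments):
//
//   call 1f          # call the *next* instruction: pushes a return address,
//   1: pop %ebx      #   which is immediately popped into the pic-base reg
//
// which is why the row at offset 5, just after the 5-byte call, shows
// CFA = esp + 8 (the entry value esp+4 plus the pushed return address).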
TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVx86_64) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine64 = Getx86_64Inspector();
@@ -2168,27 +2168,27 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVx86_64) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(19);
- EXPECT_EQ(19ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(19);
+ EXPECT_EQ(19ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r14, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r14, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-80, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_r15, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_r15, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-1512, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rbx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rbx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-88, regloc.GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVi386) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -2206,16 +2206,16 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpillRegToStackViaMOVi386) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebx, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebx, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-344, regloc.GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_esi, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_esi, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-40, regloc.GetOffset());
}
@@ -2290,25 +2290,25 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSpArithx86_64Augmented) {
// Before we touch the stack pointer, we should still refer to the
// row from after the prologue.
- row_sp = unwind_plan.GetRowForFunctionOffset(5);
- EXPECT_EQ(4ull, row_sp->GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(5);
+ EXPECT_EQ(4ull, row->GetOffset());
// Check the first stack pointer update.
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_EQ(152, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_EQ(152, row->GetCFAValue().GetOffset());
// After the nop, we should still refer to the same row.
- row_sp = unwind_plan.GetRowForFunctionOffset(13);
- EXPECT_EQ(12ull, row_sp->GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(13);
+ EXPECT_EQ(12ull, row->GetOffset());
// Check that the second stack pointer update is reflected in the
// unwind plan.
- row_sp = unwind_plan.GetRowForFunctionOffset(20);
- EXPECT_EQ(20ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(20);
+ EXPECT_EQ(20ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
}
TEST_F(Testx86AssemblyInspectionEngine, TestSimplex86_64Augmented) {
@@ -2375,10 +2375,10 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplex86_64Augmented) {
EXPECT_TRUE(engine64->AugmentUnwindPlanFromCallSite(
data, sizeof(data), sample_range, unwind_plan, reg_ctx_sp));
- row_sp = unwind_plan.GetRowForFunctionOffset(6);
- EXPECT_EQ(6ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(6);
+ EXPECT_EQ(6ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
// x86AssemblyInspectionEngine::AugmentUnwindPlanFromCallSite
// doesn't track register restores (popping a reg value back from
@@ -2386,7 +2386,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplex86_64Augmented) {
// Technically we should be able to do the following test, but it
// won't work today - the unwind plan will still say that the caller's
// rbp is on the stack.
- // EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ // EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) {
@@ -2453,10 +2453,10 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) {
EXPECT_TRUE(engine32->AugmentUnwindPlanFromCallSite(
data, sizeof(data), sample_range, unwind_plan, reg_ctx_sp));
- row_sp = unwind_plan.GetRowForFunctionOffset(5);
- EXPECT_EQ(5ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_EQ(4, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(5);
+ EXPECT_EQ(5ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_EQ(4, row->GetCFAValue().GetOffset());
// x86AssemblyInspectionEngine::AugmentUnwindPlanFromCallSite
// doesn't track register restores (popping a reg value back from
@@ -2464,7 +2464,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) {
// Technically we should be able to do the following test, but it
// won't work today - the unwind plan will still say that the caller's
// ebp is on the stack.
- // EXPECT_FALSE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ // EXPECT_FALSE(row->GetRegisterInfo(k_ebp, regloc));
}
// Check that the i386 disassembler disassembles past an opcode that
@@ -2473,7 +2473,7 @@ TEST_F(Testx86AssemblyInspectionEngine, TestSimplei386ugmented) {
// disassembling at that point (long-mode).
TEST_F(Testx86AssemblyInspectionEngine, Test32BitOnlyInstruction) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
- UnwindPlan::RowSP row_sp;
+ const UnwindPlan::Row *row;
AddressRange sample_range;
UnwindPlan unwind_plan(eRegisterKindLLDB);
std::unique_ptr<x86AssemblyInspectionEngine> engine32 = Geti386Inspector();
@@ -2490,13 +2490,13 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32BitOnlyInstruction) {
EXPECT_TRUE(engine32->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(2ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(2ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_ebp, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_ebp, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
@@ -2505,13 +2505,13 @@ TEST_F(Testx86AssemblyInspectionEngine, Test32BitOnlyInstruction) {
EXPECT_TRUE(engine64->GetNonCallSiteUnwindPlanFromAssembly(
data, sizeof(data), sample_range, unwind_plan));
- row_sp = unwind_plan.GetRowForFunctionOffset(2);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(2);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_FALSE(row_sp->GetRegisterInfo(k_rbp, regloc));
+ EXPECT_FALSE(row->GetRegisterInfo(k_rbp, regloc));
}
TEST_F(Testx86AssemblyInspectionEngine, TestStackRealign8BitDisp_i386) {
@@ -2715,123 +2715,123 @@ TEST_F(Testx86AssemblyInspectionEngine, TestReturnDetect) {
UnwindPlan::Row::AbstractRegisterLocation regloc;
// 0: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(0);
- EXPECT_EQ(0ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(0);
+ EXPECT_EQ(0ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 1: CFA=rsp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(1);
- EXPECT_EQ(1ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(1);
+ EXPECT_EQ(1ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 4: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(4);
- EXPECT_EQ(4ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(4);
+ EXPECT_EQ(4ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 7: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(7);
- EXPECT_EQ(7ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(7);
+ EXPECT_EQ(7ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 8: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(8);
- EXPECT_EQ(8ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(8);
+ EXPECT_EQ(8ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 11: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(11);
- EXPECT_EQ(11ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(11);
+ EXPECT_EQ(11ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 12: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(12);
- EXPECT_EQ(12ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(12);
+ EXPECT_EQ(12ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 15: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(15);
- EXPECT_EQ(15ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(15);
+ EXPECT_EQ(15ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 18: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(18);
- EXPECT_EQ(18ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(18);
+ EXPECT_EQ(18ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 21: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(21);
- EXPECT_EQ(21ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(21);
+ EXPECT_EQ(21ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(8, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
// 24: CFA=rbp+16 => rbp=[CFA-16] rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(24);
- EXPECT_EQ(24ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(16, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(24);
+ EXPECT_EQ(24ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(16, row->GetCFAValue().GetOffset());
- EXPECT_TRUE(row_sp->GetRegisterInfo(k_rip, regloc));
+ EXPECT_TRUE(row->GetRegisterInfo(k_rip, regloc));
EXPECT_TRUE(regloc.IsAtCFAPlusOffset());
EXPECT_EQ(-8, regloc.GetOffset());
}
@@ -2881,39 +2881,38 @@ TEST_F(Testx86AssemblyInspectionEngine, TestDisassemblyMidFunctionEpilogues) {
// Check that we've unwound the stack after the first mid-function epilogue
// row: CFA=esp +4 => esp=CFA+0 eip=[CFA-4]
- UnwindPlan::RowSP row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize, row_sp->GetCFAValue().GetOffset());
+ const UnwindPlan::Row *row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize, row->GetCFAValue().GetOffset());
// Check that we've reinstated the stack frame setup
// unwind instructions after a jmpq *%eax
// row: CFA=ebp +8 => esp=CFA+0 eip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(18);
- EXPECT_EQ(18ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_ebp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize * 2, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(18);
+ EXPECT_EQ(18ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_ebp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize * 2, row->GetCFAValue().GetOffset());
// Check that we've reinstated the stack frame setup
// unwind instructions after a mid-function retq
// row: CFA=ebp +8 => esp=CFA+0 eip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_ebp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize * 2, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_ebp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize * 2, row->GetCFAValue().GetOffset());
// After last instruction in the function, verify that
// the stack frame has been unwound
// row: CFA=esp +4 => esp=CFA+0 eip=[CFA-4]
- row_sp = unwind_plan.GetRowForFunctionOffset(34);
- EXPECT_EQ(34ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_esp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize, row_sp->GetCFAValue().GetOffset());
-
+ row = unwind_plan.GetRowForFunctionOffset(34);
+ EXPECT_EQ(34ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_esp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize, row->GetCFAValue().GetOffset());
unwind_plan.Clear();
@@ -2923,38 +2922,36 @@ TEST_F(Testx86AssemblyInspectionEngine, TestDisassemblyMidFunctionEpilogues) {
// Check that we've unwound the stack after the first mid-function epilogue
// row: CFA=rsp +8 => rsp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(16);
- EXPECT_EQ(16ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(16);
+ EXPECT_EQ(16ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize, row->GetCFAValue().GetOffset());
// Check that we've reinstated the stack frame setup
// unwind instructions after a jmpq *%eax
// row: CFA=rbp+16 => rsp=CFA+0 rip=[CFA-16]
- row_sp = unwind_plan.GetRowForFunctionOffset(18);
- EXPECT_EQ(18ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize * 2, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(18);
+ EXPECT_EQ(18ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize * 2, row->GetCFAValue().GetOffset());
// Check that we've reinstated the stack frame setup
// unwind instructions after a mid-function retq
// row: CFA=rbp+16 => rsp=CFA+0 rip=[CFA-16]
- row_sp = unwind_plan.GetRowForFunctionOffset(28);
- EXPECT_EQ(28ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rbp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize * 2, row_sp->GetCFAValue().GetOffset());
+ row = unwind_plan.GetRowForFunctionOffset(28);
+ EXPECT_EQ(28ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rbp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize * 2, row->GetCFAValue().GetOffset());
// After last instruction in the function, verify that
// the stack frame has been unwound
// row: CFA=rsp +8 => esp=CFA+0 rip=[CFA-8]
- row_sp = unwind_plan.GetRowForFunctionOffset(34);
- EXPECT_EQ(34ull, row_sp->GetOffset());
- EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
- EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
- EXPECT_EQ(wordsize, row_sp->GetCFAValue().GetOffset());
-
-
+ row = unwind_plan.GetRowForFunctionOffset(34);
+ EXPECT_EQ(34ull, row->GetOffset());
+ EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == k_rsp);
+ EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true);
+ EXPECT_EQ(wordsize, row->GetCFAValue().GetOffset());
}
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 3c59216..051eaf6 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -468,6 +468,9 @@ The current vendor extensions supported are:
``Xmipslsp``
LLVM implements load/store pair instructions for the `p8700 processor <https://mips.com/products/hardware/p8700/>` by MIPS.
+``experimental-XRivosVizip``
+ LLVM implements `version 0.1 of the Rivos Vector Register Zips extension specification <https://github.com/rivosinc/rivos-custom-extensions>`__.
+
Experimental C Intrinsics
=========================
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 9792749..3bff205e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -353,6 +353,7 @@ struct APFloatBase {
static bool semanticsHasSignedRepr(const fltSemantics &);
static bool semanticsHasInf(const fltSemantics &);
static bool semanticsHasNaN(const fltSemantics &);
+ static bool isIEEELikeFP(const fltSemantics &);
// Returns true if any number described by \p Src can be precisely represented
// by a normal (not subnormal) value in \p Dst.
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index d4b1a9e..d399457 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -166,7 +166,6 @@ public:
CBufferExtType &operator=(const CBufferExtType &) = delete;
Type *getResourceType() const { return getTypeParameter(0); }
- uint32_t getCBufferSize() const { return getIntParameter(0); }
static bool classof(const TargetExtType *T) {
return T->getName() == "dx.CBuffer";
@@ -197,6 +196,27 @@ public:
}
};
+/// The dx.Layout target extension type
+///
+/// `target("dx.Layout", <Type>, <size>, [offsets...])`
+class LayoutExtType : public TargetExtType {
+public:
+ LayoutExtType() = delete;
+ LayoutExtType(const LayoutExtType &) = delete;
+ LayoutExtType &operator=(const LayoutExtType &) = delete;
+
+ Type *getWrappedType() const { return getTypeParameter(0); }
+ uint32_t getSize() const { return getIntParameter(0); }
+ uint32_t getOffsetOfElement(int I) const { return getIntParameter(I + 1); }
+
+ static bool classof(const TargetExtType *T) {
+ return T->getName() == "dx.Layout";
+ }
+ static bool classof(const Type *T) {
+ return isa<TargetExtType>(T) && classof(cast<TargetExtType>(T));
+ }
+};
+
//===----------------------------------------------------------------------===//
class ResourceTypeInfo {
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h b/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h
new file mode 100644
index 0000000..3bdf330
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h
@@ -0,0 +1,40 @@
+//===---- GetDylibInterface.h - Get interface for real dylib ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Get symbol interface from a real dynamic library or TAPI file. These
+// interfaces can be used to simulate weak linking (ld64 -weak-lx /
+// -weak_library) against a library that is absent at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H
+#define LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/Object/TapiUniversal.h"
+
+namespace llvm::orc {
+
+/// Returns a SymbolNameSet containing the exported symbols defined in the
+/// given dylib.
+Expected<SymbolNameSet> getDylibInterfaceFromDylib(ExecutionSession &ES,
+ Twine Path);
+
+/// Returns a SymbolNameSet containing the exported symbols defined in the
+/// relevant slice of the TapiUniversal file.
+Expected<SymbolNameSet> getDylibInterfaceFromTapiFile(ExecutionSession &ES,
+ Twine Path);
+
+/// Returns a SymbolNameSet containing the exported symbols defined in the
+/// relevant slice of the given file, which may be either a dylib or a tapi
+/// file.
+Expected<SymbolNameSet> getDylibInterface(ExecutionSession &ES, Twine Path);
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H
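Editor's note — a hedged usage sketch of the new entry point (the path is a placeholder; the fragment assumes it sits in a function that can propagate llvm::Error):

  // Query the exported-symbol set of a dylib or TAPI file.
  Expected<SymbolNameSet> Syms =
      getDylibInterface(ES, "/usr/lib/libfoo.dylib"); // hypothetical path
  if (!Syms)
    return Syms.takeError();
  for (const SymbolStringPtr &Name : *Syms)
    ; // e.g. feed each name into a weak-linking simulation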
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/GetTapiInterface.h b/llvm/include/llvm/ExecutionEngine/Orc/GetTapiInterface.h
deleted file mode 100644
index 36e9438..0000000
--- a/llvm/include/llvm/ExecutionEngine/Orc/GetTapiInterface.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===---- GetTapiInterface.h -- Get interface from TAPI file ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Get symbol interface from TAPI file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_ORC_GETTAPIINTERFACE_H
-#define LLVM_EXECUTIONENGINE_ORC_GETTAPIINTERFACE_H
-
-#include "llvm/ExecutionEngine/Orc/Core.h"
-#include "llvm/Object/TapiUniversal.h"
-
-namespace llvm::orc {
-
-/// Returns a SymbolNameSet containing the exported symbols defined in the
-/// relevant slice of the TapiUniversal file.
-Expected<SymbolNameSet> getInterfaceFromTapiFile(ExecutionSession &ES,
- object::TapiUniversal &TU);
-
-} // namespace llvm::orc
-
-#endif // LLVM_EXECUTIONENGINE_ORC_GETTAPIINTERFACE_H
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 210acbf..39fd46b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -34,6 +34,7 @@ def OpenMP : DirectiveLanguage {
def OMPC_Absent : Clause<"absent"> {
let clangClass = "OMPAbsentClause";
+ let flangClass = "OmpAbsentClause";
}
def OMPC_Acquire : Clause<"acquire"> {
let clangClass = "OMPAcquireClause";
@@ -107,6 +108,7 @@ def OMPC_CancellationConstructType : Clause<"cancellation_construct_type"> {
}
def OMPC_Contains : Clause<"contains"> {
let clangClass = "OMPContainsClause";
+ let flangClass = "OmpContainsClause";
}
def OMPC_Capture : Clause<"capture"> {
let clangClass = "OMPCaptureClause";
@@ -225,6 +227,7 @@ def OMPC_Hint : Clause<"hint"> {
}
def OMPC_Holds : Clause<"holds"> {
let clangClass = "OMPHoldsClause";
+ let flangClass = "OmpHoldsClause";
}
def OMPC_If : Clause<"if"> {
let clangClass = "OMPIfClause";
@@ -562,6 +565,14 @@ def OMP_Allocators : Directive<"allocators"> {
def OMP_Assumes : Directive<"assumes"> {
let association = AS_None;
let category = CA_Informational;
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Absent, 51>,
+ VersionedClause<OMPC_Contains, 51>,
+ VersionedClause<OMPC_Holds, 51>,
+ VersionedClause<OMPC_NoOpenMP, 51>,
+ VersionedClause<OMPC_NoOpenMPRoutines, 51>,
+ VersionedClause<OMPC_NoParallelism, 51>,
+ ];
}
def OMP_EndAssumes : Directive<"end assumes"> {
let association = AS_Delimited;
@@ -608,6 +619,14 @@ def OMP_Barrier : Directive<"barrier"> {
def OMP_BeginAssumes : Directive<"begin assumes"> {
let association = AS_Delimited;
let category = CA_Informational;
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Absent, 51>,
+ VersionedClause<OMPC_Contains, 51>,
+ VersionedClause<OMPC_Holds, 51>,
+ VersionedClause<OMPC_NoOpenMP, 51>,
+ VersionedClause<OMPC_NoOpenMPRoutines, 51>,
+ VersionedClause<OMPC_NoParallelism, 51>,
+ ];
}
def OMP_BeginDeclareTarget : Directive<"begin declare target"> {
let allowedClauses = [
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5fc2bd6..80b4aa2 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -728,13 +728,12 @@ public:
LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
const Twine &Name = "loop");
- /// Generator for the control flow structure of an OpenMP canonical loop.
+ /// Calculate the trip count of a canonical loop.
///
- /// Instead of a logical iteration space, this allows specifying user-defined
- /// loop counter values using increment, upper- and lower bounds. To
- /// disambiguate the terminology when counting downwards, instead of lower
- /// bounds we use \p Start for the loop counter value in the first body
- /// iteration.
+ /// This allows specifying user-defined loop counter values using increment,
+ /// upper- and lower bounds. To disambiguate the terminology when counting
+ /// downwards, instead of lower bounds we use \p Start for the loop counter
+ /// value in the first body iteration.
///
/// Consider the following limitations:
///
@@ -758,7 +757,32 @@ public:
///
/// for (int i = 0; i < 42; i -= 1u)
///
- //
+ /// \param Loc The insert and source location description.
+ /// \param Start Value of the loop counter for the first iteration.
+ /// \param Stop Loop counter values past this will stop the loop.
+ /// \param Step Loop counter increment after each iteration; negative
+ /// means counting down.
+ /// \param IsSigned Whether Start, Stop and Step are signed integers.
+ /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
+ /// counter.
+ /// \param Name Base name used to derive instruction names.
+ ///
+ /// \returns The value holding the calculated trip count.
+ Value *calculateCanonicalLoopTripCount(const LocationDescription &Loc,
+ Value *Start, Value *Stop, Value *Step,
+ bool IsSigned, bool InclusiveStop,
+ const Twine &Name = "loop");
+
+ /// Generator for the control flow structure of an OpenMP canonical loop.
+ ///
+ /// Instead of a logical iteration space, this allows specifying user-defined
+ /// loop counter values using increment, upper- and lower bounds. To
+ /// disambiguate the terminology when counting downwards, instead of lower
+ /// bounds we use \p Start for the loop counter value in the first body
+ /// iteration.
+ ///
+ /// It calls \ref calculateCanonicalLoopTripCount for trip count
+ /// calculations, so the limitations of that method apply here as well.
+ ///
/// \param Loc The insert and source location description.
/// \param BodyGenCB Callback that will generate the loop body code.
/// \param Start Value of the loop counter for the first iterations.
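Editor's note — a quick worked check of the trip-count semantics documented above, as a hedged host-side sketch of the arithmetic the builder emits as IR (unsigned, counting-up case only; assumes Step != 0 and ignores the wrap-around cases the real implementation guards):

  #include <cstdint>
  // ceil((Stop - Start [+1 if inclusive]) / Step), or 0 for an empty range.
  uint64_t tripCount(uint64_t Start, uint64_t Stop, uint64_t Step,
                     bool InclusiveStop) {
    if (InclusiveStop ? Stop < Start : Stop <= Start)
      return 0;
    uint64_t Span = Stop - Start + (InclusiveStop ? 1 : 0);
    return (Span + Step - 1) / Step; // ceiling division
  }
  // tripCount(0, 42, 2, false) == 21; tripCount(0, 42, 2, true) == 22.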
diff --git a/llvm/include/llvm/Support/AlignOf.h b/llvm/include/llvm/Support/AlignOf.h
index f586d7f..4f02e81 100644
--- a/llvm/include/llvm/Support/AlignOf.h
+++ b/llvm/include/llvm/Support/AlignOf.h
@@ -13,20 +13,17 @@
#ifndef LLVM_SUPPORT_ALIGNOF_H
#define LLVM_SUPPORT_ALIGNOF_H
-#include <type_traits>
+#include <algorithm>
namespace llvm {
/// A suitably aligned and sized character array member which can hold elements
/// of any type.
-///
-/// This template is equivalent to std::aligned_union_t<1, ...>, but we cannot
-/// use it due to a bug in the MSVC x86 compiler:
-/// https://github.com/microsoft/STL/issues/1533
-/// Using `alignas` here works around the bug.
template <typename T, typename... Ts> struct AlignedCharArrayUnion {
- using AlignedUnion = std::aligned_union_t<1, T, Ts...>;
- alignas(alignof(AlignedUnion)) char buffer[sizeof(AlignedUnion)];
+ // Work around "internal compiler error: Segmentation fault" with GCC 7.5,
+ // apparently caused by alignas(Ts...).
+ static constexpr std::size_t Align = std::max({alignof(T), alignof(Ts)...});
+ alignas(Align) char buffer[std::max({sizeof(T), sizeof(Ts)...})];
};
} // end namespace llvm
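Editor's note — a hedged usage sketch of the rewritten template (behavior is unchanged: one buffer sized and aligned for the largest listed member):

  #include "llvm/Support/AlignOf.h"
  #include <memory>
  #include <new>
  #include <string>
  int main() {
    llvm::AlignedCharArrayUnion<std::string, double> Storage;
    auto *S = new (Storage.buffer) std::string("hello"); // placement-new
    std::destroy_at(S); // lifetime is managed explicitly by the user
  }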
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
index 44cb518..390f284 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -745,8 +745,9 @@ bool SampleProfileLoaderBaseImpl<BT>::propagateThroughEdges(
if (i == 0) {
// First, visit all predecessor edges.
- NumTotalEdges = Predecessors[BB].size();
- for (auto *Pred : Predecessors[BB]) {
+ auto &Preds = Predecessors[BB];
+ NumTotalEdges = Preds.size();
+ for (auto *Pred : Preds) {
Edge E = std::make_pair(Pred, BB);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
if (E.first == E.second)
@@ -757,8 +758,9 @@ bool SampleProfileLoaderBaseImpl<BT>::propagateThroughEdges(
}
} else {
// On the second round, visit all successor edges.
- NumTotalEdges = Successors[BB].size();
- for (auto *Succ : Successors[BB]) {
+ auto &Succs = Successors[BB];
+ NumTotalEdges = Succs.size();
+ for (auto *Succ : Succs) {
Edge E = std::make_pair(BB, Succ);
TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
}
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 4ffc9db..22afb4c 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -382,7 +382,14 @@ ResourceTypeInfo::UAVInfo ResourceTypeInfo::getUAV() const {
uint32_t ResourceTypeInfo::getCBufferSize(const DataLayout &DL) const {
assert(isCBuffer() && "Not a CBuffer");
- return cast<CBufferExtType>(HandleTy)->getCBufferSize();
+
+ Type *ElTy = cast<CBufferExtType>(HandleTy)->getResourceType();
+
+ if (auto *LayoutTy = dyn_cast<LayoutExtType>(ElTy))
+ return LayoutTy->getSize();
+
+ // TODO: What should we do with unannotated arrays?
+ return DL.getTypeAllocSize(ElTy);
}
dxil::SamplerType ResourceTypeInfo::getSamplerType() const {
@@ -749,6 +756,11 @@ void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) {
NextID = 0;
}
+ // We need to make sure the resource types stay ordered even if some kinds
+ // are missing.
+ FirstCBuffer = std::min({FirstCBuffer, FirstSampler});
+ FirstUAV = std::min({FirstUAV, FirstCBuffer});
+
// Adjust the resource binding to use the next ID.
RBI.setBindingID(NextID++);
}
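Editor's note — in IR-type terms, the new size computation resolves roughly as follows (hedged sketch; the textual type syntax is only illustrative):

  // With an explicit layout annotation:
  //   target("dx.CBuffer", target("dx.Layout", %struct.S, 24, 0, 16))
  //   -> getCBufferSize() == 24 (the recorded layout size)
  // Without one:
  //   target("dx.CBuffer", %struct.S)
  //   -> getCBufferSize() == DL.getTypeAllocSize(%struct.S) (fallback)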
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 2e8487d..690e92a 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1036,9 +1036,10 @@ bool LLParser::parseStandaloneMetadata() {
assert(NumberedMetadata[MetadataID] == Init && "Tracking VH didn't work");
} else {
- if (NumberedMetadata.count(MetadataID))
+ auto [It, Inserted] = NumberedMetadata.try_emplace(MetadataID);
+ if (!Inserted)
return tokError("Metadata id is already used");
- NumberedMetadata[MetadataID].reset(Init);
+ It->second.reset(Init);
}
return false;
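Editor's note — the try_emplace rewrite above is the usual single-lookup idiom; a self-contained hedged sketch with std::map (same shape as the DenseMap code):

  #include <cassert>
  #include <map>
  #include <string>
  int main() {
    std::map<unsigned, std::string> M;
    auto [It, Inserted] = M.try_emplace(7); // one probe: insert or find
    assert(Inserted);
    It->second = "init"; // fill the slot only on first insertion
    auto [It2, Inserted2] = M.try_emplace(7);
    assert(!Inserted2 && It2->second == "init"); // duplicate caught, no second lookup
  }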
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index ddf0275..5bc30fe 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1777,8 +1777,9 @@ DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) {
// Otherwise the context must be a DISubprogram.
auto *SPScope = cast<DISubprogram>(Context);
- if (getAbstractScopeDIEs().count(SPScope))
- return getAbstractScopeDIEs()[SPScope];
+ const auto &DIEs = getAbstractScopeDIEs();
+ if (auto It = DIEs.find(SPScope); It != DIEs.end())
+ return It->second;
}
return DwarfUnit::getOrCreateContextDIE(Context);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
index 9841c8a..05923e5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
@@ -308,9 +308,10 @@ LegacyLegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
ArrayRef<SizeAndActionsVec> Actions;
if (Aspect.Type.isPointer()) {
auto &PA = AddrSpace2PointerActions[OpcodeIdx];
- if (PA.find(Aspect.Type.getAddressSpace()) == PA.end())
+ auto It = PA.find(Aspect.Type.getAddressSpace());
+ if (It == PA.end())
return {NotFound, LLT()};
- Actions = PA.find(Aspect.Type.getAddressSpace())->second;
+ Actions = It->second;
} else {
Actions = ScalarActions[OpcodeIdx];
}
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 9c8dcac..b35f765 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -978,8 +978,9 @@ void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
// cost of the most expensive instruction of the group.
Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
for (SelectLike &SI : ASI.Selects) {
- SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
- BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
+ const auto &ICM = InstCostMap[SI.getI()];
+ SelectCost = std::max(SelectCost, ICM.PredCost);
+ BranchCost = std::max(BranchCost, ICM.NonPredCost);
}
if (BranchCost < SelectCost) {
OptimizationRemark OR(DEBUG_TYPE, "SelectOpti",
@@ -1327,8 +1328,8 @@ bool SelectOptimizeImpl::computeLoopCosts(
// BranchCost = PredictedPathCost + MispredictCost
// PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
// MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
- if (SImap.contains(&I)) {
- auto SI = SImap.at(&I);
+ if (auto It = SImap.find(&I); It != SImap.end()) {
+ auto SI = It->second;
const auto *SG = SGmap.at(&I);
Scaled64 TrueOpCost = SI.getOpCostOnBranch(true, InstCostMap, TTI);
Scaled64 FalseOpCost = SI.getOpCostOnBranch(false, InstCostMap, TTI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a83be13..f197ae6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17271,6 +17271,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// prefer it.
SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) {
EVT VT = N->getValueType(0);
+ if (!APFloat::isIEEELikeFP(VT.getFltSemantics()))
+ return SDValue();
+
SDValue ConstOp, Pow2Op;
std::optional<int> Mantissa;
@@ -17297,8 +17300,8 @@ SDValue DAGCombiner::combineFMulOrFDivWithIntPow2(SDNode *N) {
const APFloat &APF = CFP->getValueAPF();
- // Make sure we have normal/ieee constant.
- if (!APF.isNormal() || !APF.isIEEE())
+ // Make sure we have a normal constant.
+ if (!APF.isNormal())
return false;
// Make sure the floats exponent is within the bounds that this transform
@@ -28191,7 +28194,8 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
if ((Fold || Swap) &&
TLI.getBooleanContents(CmpOpVT) ==
TargetLowering::ZeroOrOneBooleanContent &&
- (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
+ (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT)) &&
+ TLI.convertSelectOfConstantsToMath(VT)) {
if (Swap) {
CC = ISD::getSetCCInverse(CC, CmpOpVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 0244c17..9fbcb5b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1570,6 +1570,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::STRICT_FREM:
case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break;
case ISD::FMODF: ExpandFloatRes_FMODF(N); break;
+ case ISD::FSINCOS: ExpandFloatRes_FSINCOS(N); break;
+ case ISD::FSINCOSPI: ExpandFloatRes_FSINCOSPI(N); break;
// clang-format on
}
@@ -1625,6 +1627,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FMODF(SDNode *N) {
/*CallRetResNo=*/0);
}
+void DAGTypeLegalizer::ExpandFloatRes_FSINCOS(SDNode *N) {
+ ExpandFloatRes_UnaryWithTwoFPResults(N, RTLIB::getSINCOS(N->getValueType(0)));
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSINCOSPI(SDNode *N) {
+ ExpandFloatRes_UnaryWithTwoFPResults(N,
+ RTLIB::getSINCOSPI(N->getValueType(0)));
+}
+
void DAGTypeLegalizer::ExpandFloatRes_UnaryWithTwoFPResults(
SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo) {
assert(!N->isStrictFPOpcode() && "strictfp not implemented");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cac969f..74d7210 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -718,6 +718,8 @@ private:
void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FMODF(SDNode *N);
+ void ExpandFloatRes_FSINCOS(SDNode *N);
+ void ExpandFloatRes_FSINCOSPI(SDNode *N);
// clang-format on
// Float Operand Expansion.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1c58a7f..ea28f72 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3360,16 +3360,28 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
case Intrinsic::experimental_gc_statepoint:
LowerStatepoint(cast<GCStatepointInst>(I), EHPadBB);
break;
+ // wasm_throw, wasm_rethrow: These are usually handled in
+ // visitTargetIntrinsic, but they are special because they can be invoked,
+ // so we manually lower them to DAG nodes here.
+ case Intrinsic::wasm_throw: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::array<SDValue, 4> Ops = {
+ getControlRoot(), // inchain for the terminator node
+ DAG.getTargetConstant(Intrinsic::wasm_throw, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ getValue(I.getArgOperand(0)), // tag
+ getValue(I.getArgOperand(1)) // thrown value
+ };
+ SDVTList VTs = DAG.getVTList(ArrayRef<EVT>({MVT::Other})); // outchain
+ DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
+ break;
+ }
case Intrinsic::wasm_rethrow: {
- // This is usually done in visitTargetIntrinsic, but this intrinsic is
- // special because it can be invoked, so we manually lower it to a DAG
- // node here.
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(getControlRoot()); // inchain for the terminator node
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- Ops.push_back(
+ std::array<SDValue, 2> Ops = {
+ getControlRoot(), // inchain for the terminator node
DAG.getTargetConstant(Intrinsic::wasm_rethrow, getCurSDLoc(),
- TLI.getPointerTy(DAG.getDataLayout())));
+ TLI.getPointerTy(DAG.getDataLayout()))};
SDVTList VTs = DAG.getVTList(ArrayRef<EVT>({MVT::Other})); // outchain
DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
break;
diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp
index d18196b..fc98f59 100644
--- a/llvm/lib/CodeGen/WasmEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -201,10 +201,8 @@ bool WasmEHPrepareImpl::prepareThrows(Function &F) {
// delete all following instructions within the BB, and delete all the dead
// children of the BB as well.
for (User *U : ThrowF->users()) {
- // A call to @llvm.wasm.throw() is only generated from __cxa_throw()
- // builtin call within libcxxabi, and cannot be an InvokeInst.
- auto *ThrowI = cast<CallInst>(U);
- if (ThrowI->getFunction() != &F)
+ auto *ThrowI = dyn_cast<CallInst>(U);
+ if (!ThrowI || ThrowI->getFunction() != &F)
continue;
Changed = true;
auto *BB = ThrowI->getParent();
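Editor's note — the cast-to-dyn_cast change above matters because users of llvm.wasm.throw may now be InvokeInst rather than CallInst; a self-contained analogy of the failure mode (using standard RTTI in place of LLVM's cast machinery):

  #include <cassert>
  struct Inst { virtual ~Inst() = default; };
  struct CallInst : Inst {};
  struct InvokeInst : Inst {};
  int main() {
    InvokeInst Inv;
    Inst *U = &Inv;
    // Before: an unconditional cast assumed every user is a direct call.
    // After: a checked cast yields nullptr for invokes, and the loop skips them.
    assert(dynamic_cast<CallInst *>(U) == nullptr);
  }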
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
index 6ce9bcbe..8074f1a 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
@@ -721,12 +721,11 @@ Error LVCodeViewReader::traverseSymbolSection(StringRef SectionName,
getFileName());
LLVM_DEBUG({ W.printString("Symbol Name", SymbolName); });
- if (FunctionLineTables.count(SymbolName) != 0) {
+ if (!FunctionLineTables.try_emplace(SymbolName, Contents).second) {
// Saw debug info for this function already?
return createStringError(object_error::parse_failed, getFileName());
}
- FunctionLineTables[SymbolName] = Contents;
SymbolNames.push_back(SymbolName);
}
break;
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index eab8ee16..c701a05 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -25,7 +25,7 @@ add_llvm_component_library(LLVMOrcJIT
EPCIndirectionUtils.cpp
ExecutionUtils.cpp
ObjectFileInterface.cpp
- GetTapiInterface.cpp
+ GetDylibInterface.cpp
IndirectionUtils.cpp
IRCompileLayer.cpp
IRTransformLayer.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp b/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp
new file mode 100644
index 0000000..f49b9fe
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp
@@ -0,0 +1,127 @@
+//===-------- GetDylibInterface.cpp - Get interface for real dylib --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/GetDylibInterface.h"
+
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm::orc {
+
+Expected<SymbolNameSet> getDylibInterfaceFromDylib(ExecutionSession &ES,
+ Twine Path) {
+ auto CPUType = MachO::getCPUType(ES.getTargetTriple());
+ if (!CPUType)
+ return CPUType.takeError();
+
+ auto CPUSubType = MachO::getCPUSubType(ES.getTargetTriple());
+ if (!CPUSubType)
+ return CPUSubType.takeError();
+
+ auto Buf = MemoryBuffer::getFile(Path);
+ if (!Buf)
+ return createFileError(Path, Buf.getError());
+
+ auto SymFile =
+ object::SymbolicFile::createSymbolicFile((*Buf)->getMemBufferRef());
+ if (!SymFile)
+ return SymFile.takeError();
+
+ std::unique_ptr<object::MachOObjectFile> MachOFile;
+ if (isa<object::MachOObjectFile>(**SymFile))
+ MachOFile.reset(dyn_cast<object::MachOObjectFile>(SymFile->release()));
+ else if (auto *MachOUni =
+ dyn_cast<object::MachOUniversalBinary>(SymFile->get())) {
+ for (auto &O : MachOUni->objects()) {
+ if (O.getCPUType() == *CPUType && O.getCPUSubType() == *CPUSubType) {
+ if (auto Obj = O.getAsObjectFile())
+ MachOFile = std::move(*Obj);
+ else
+ return Obj.takeError();
+ break;
+ }
+ }
+ if (!MachOFile)
+ return make_error<StringError>("MachO universal binary at " + Path +
+ " does not contain a slice for " +
+ ES.getTargetTriple().str(),
+ inconvertibleErrorCode());
+ } else
+ return make_error<StringError>("File at " + Path + " is not a MachO",
+ inconvertibleErrorCode());
+
+ if (MachOFile->getHeader().filetype != MachO::MH_DYLIB)
+ return make_error<StringError>("MachO at " + Path + " is not a dylib",
+ inconvertibleErrorCode());
+
+ SymbolNameSet Symbols;
+ for (auto &Sym : MachOFile->symbols()) {
+ if (auto Name = Sym.getName())
+ Symbols.insert(ES.intern(*Name));
+ else
+ return Name.takeError();
+ }
+
+ return std::move(Symbols);
+}
+
+Expected<SymbolNameSet> getDylibInterfaceFromTapiFile(ExecutionSession &ES,
+ Twine Path) {
+ SymbolNameSet Symbols;
+
+ auto TapiFileBuffer = MemoryBuffer::getFile(Path);
+ if (!TapiFileBuffer)
+ return createFileError(Path, TapiFileBuffer.getError());
+
+ auto Tapi =
+ object::TapiUniversal::create((*TapiFileBuffer)->getMemBufferRef());
+ if (!Tapi)
+ return Tapi.takeError();
+
+ auto CPUType = MachO::getCPUType(ES.getTargetTriple());
+ if (!CPUType)
+ return CPUType.takeError();
+
+ auto CPUSubType = MachO::getCPUSubType(ES.getTargetTriple());
+ if (!CPUSubType)
+ return CPUSubType.takeError();
+
+ auto &IF = (*Tapi)->getInterfaceFile();
+ auto Interface =
+ IF.extract(MachO::getArchitectureFromCpuType(*CPUType, *CPUSubType));
+ if (!Interface)
+ return Interface.takeError();
+
+ for (auto *Sym : (*Interface)->exports())
+ Symbols.insert(ES.intern(Sym->getName()));
+
+ return Symbols;
+}
+
+Expected<SymbolNameSet> getDylibInterface(ExecutionSession &ES, Twine Path) {
+ file_magic Magic;
+ if (auto EC = identify_magic(Path, Magic))
+ return createFileError(Path, EC);
+
+ SymbolNameSet Symbols;
+ switch (Magic) {
+ case file_magic::macho_dynamically_linked_shared_lib:
+ return getDylibInterfaceFromDylib(ES, Path);
+ case file_magic::tapi_file:
+ return getDylibInterfaceFromTapiFile(ES, Path);
+ default:
+ return make_error<StringError>("Cannot get interface for " + Path +
+ " unrecognized file type",
+ inconvertibleErrorCode());
+ }
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/GetTapiInterface.cpp b/llvm/lib/ExecutionEngine/Orc/GetTapiInterface.cpp
deleted file mode 100644
index f74db3e..0000000
--- a/llvm/lib/ExecutionEngine/Orc/GetTapiInterface.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--------- GetTapiInterface.cpp - Get interface from TAPI file --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/Orc/GetTapiInterface.h"
-
-#define DEBUG_TYPE "orc"
-
-namespace llvm::orc {
-
-Expected<SymbolNameSet> getInterfaceFromTapiFile(ExecutionSession &ES,
- object::TapiUniversal &TU) {
- SymbolNameSet Symbols;
-
- auto CPUType = MachO::getCPUType(ES.getTargetTriple());
- if (!CPUType)
- return CPUType.takeError();
-
- auto CPUSubType = MachO::getCPUSubType(ES.getTargetTriple());
- if (!CPUSubType)
- return CPUSubType.takeError();
-
- auto &TUIF = TU.getInterfaceFile();
- auto ArchInterface =
- TUIF.extract(MachO::getArchitectureFromCpuType(*CPUType, *CPUSubType));
- if (!ArchInterface)
- return ArchInterface.takeError();
-
- for (auto *Sym : (*ArchInterface)->exports())
- Symbols.insert(ES.intern(Sym->getName()));
-
- return Symbols;
-}
-
-} // namespace llvm::orc
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 1e6a195..dd27c4c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4059,10 +4059,9 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
return CL;
}
-Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
- const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
- Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
- InsertPointTy ComputeIP, const Twine &Name) {
+Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
+ const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
+ bool IsSigned, bool InclusiveStop, const Twine &Name) {
// Consider the following difficulties (assuming 8-bit signed integers):
// * Adding \p Step to the loop counter which passes \p Stop may overflow:
@@ -4075,9 +4074,7 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
assert(IndVarTy == Stop->getType() && "Stop type mismatch");
assert(IndVarTy == Step->getType() && "Step type mismatch");
- LocationDescription ComputeLoc =
- ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
- updateToLocation(ComputeLoc);
+ updateToLocation(Loc);
ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
@@ -4117,8 +4114,20 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
}
- Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
- "omp_" + Name + ".tripcount");
+
+ return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
+ "omp_" + Name + ".tripcount");
+}
+
+Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
+ const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
+ Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
+ InsertPointTy ComputeIP, const Twine &Name) {
+ LocationDescription ComputeLoc =
+ ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
+
+ Value *TripCount = calculateCanonicalLoopTripCount(
+ ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
Builder.restoreIP(CodeGenIP);
@@ -4131,6 +4140,23 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
}
// Returns an LLVM function to call for initializing loop bounds using OpenMP
+// static scheduling for composite `distribute parallel for` depending on
+// `type`. Only i32 and i64 are supported by the runtime. Always interpret
+// integers as unsigned similarly to CanonicalLoopInfo.
+static FunctionCallee
+getKmpcDistForStaticInitForType(Type *Ty, Module &M,
+ OpenMPIRBuilder &OMPBuilder) {
+ unsigned Bitwidth = Ty->getIntegerBitWidth();
+ if (Bitwidth == 32)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
+ if (Bitwidth == 64)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
+ llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
@@ -4164,7 +4190,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
// Declare useful OpenMP runtime functions.
Value *IV = CLI->getIndVar();
Type *IVTy = IV->getType();
- FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
+ FunctionCallee StaticInit =
+ LoopType == WorksharingLoopType::DistributeForStaticLoop
+ ? getKmpcDistForStaticInitForType(IVTy, M, *this)
+ : getKmpcForStaticInitForType(IVTy, M, *this);
FunctionCallee StaticFini =
getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
@@ -4200,9 +4229,15 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
// Call the "init" function and update the trip count of the loop with the
// value it produced.
- Builder.CreateCall(StaticInit,
- {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
- PUpperBound, PStride, One, Zero});
+ SmallVector<Value *, 10> Args(
+ {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
+ if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+ Value *PDistUpperBound =
+ Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
+ Args.push_back(PDistUpperBound);
+ }
+ Args.append({PStride, One, Zero});
+ Builder.CreateCall(StaticInit, Args);
Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 84d9306..7d43de9 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1233,10 +1233,7 @@ Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
const Instruction *
Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const {
for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode())
- if (!isa<DbgInfoIntrinsic>(I) &&
- !(SkipPseudoOp && isa<PseudoProbeInst>(I)) &&
- !(isa<IntrinsicInst>(I) &&
- cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fake_use))
+ if (!isa<DbgInfoIntrinsic>(I) && !(SkipPseudoOp && isa<PseudoProbeInst>(I)))
return I;
return nullptr;
}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b4f9273..3477e2b 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5227,10 +5227,12 @@ void Verifier::visitInstruction(Instruction &I) {
F->getIntrinsicID() == Intrinsic::experimental_patchpoint ||
F->getIntrinsicID() == Intrinsic::fake_use ||
F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint ||
+ F->getIntrinsicID() == Intrinsic::wasm_throw ||
F->getIntrinsicID() == Intrinsic::wasm_rethrow ||
IsAttachedCallOperand(F, CBI, i),
"Cannot invoke an intrinsic other than donothing, patchpoint, "
- "statepoint, coro_resume, coro_destroy or clang.arc.attachedcall",
+ "statepoint, coro_resume, coro_destroy, clang.arc.attachedcall or "
+ "wasm.(re)throw",
&I);
Check(F->getParent() == &M, "Referencing function in another module!", &I,
&M, F, F->getParent());
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index b0d92ae..cbee7f4 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -353,6 +353,11 @@ bool APFloatBase::semanticsHasNaN(const fltSemantics &semantics) {
return semantics.nonFiniteBehavior != fltNonfiniteBehavior::FiniteOnly;
}
+bool APFloatBase::isIEEELikeFP(const fltSemantics &semantics) {
+ // Keep in sync with Type::isIEEELikeFPTy
+ return SemanticsToEnum(semantics) <= S_IEEEquad;
+}
+
bool APFloatBase::isRepresentableAsNormalIn(const fltSemantics &Src,
const fltSemantics &Dst) {
// Exponent range must be larger.
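Editor's note — hedged expectations for the new predicate, assuming the S_* enum keeps the IEEE-like semantics (half through quad) ordered first, as the comparison above relies on:

  // APFloatBase::isIEEELikeFP(APFloatBase::IEEEhalf())          -> true
  // APFloatBase::isIEEELikeFP(APFloatBase::IEEEsingle())        -> true
  // APFloatBase::isIEEELikeFP(APFloatBase::IEEEquad())          -> true
  // APFloatBase::isIEEELikeFP(APFloatBase::PPCDoubleDouble())   -> false
  // APFloatBase::isIEEELikeFP(APFloatBase::x87DoubleExtended()) -> false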
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index eaca75b..fc38bfe 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -485,10 +485,10 @@ void AArch64AsmPrinter::emitAttributes(unsigned Flags,
AArch64BuildAttrs::SubsectionType::ULEB128);
TS->emitAttribute(
AArch64BuildAttrs::getVendorName(AArch64BuildAttrs::AEABI_PAUTHABI),
- AArch64BuildAttrs::TAG_PAUTH_PLATFORM, PAuthABIPlatform, "", false);
+ AArch64BuildAttrs::TAG_PAUTH_PLATFORM, PAuthABIPlatform, "");
TS->emitAttribute(
AArch64BuildAttrs::getVendorName(AArch64BuildAttrs::AEABI_PAUTHABI),
- AArch64BuildAttrs::TAG_PAUTH_SCHEMA, PAuthABIVersion, "", false);
+ AArch64BuildAttrs::TAG_PAUTH_SCHEMA, PAuthABIVersion, "");
}
unsigned BTIValue = (Flags & AArch64BuildAttrs::Feature_BTI_Flag) ? 1 : 0;
@@ -502,13 +502,13 @@ void AArch64AsmPrinter::emitAttributes(unsigned Flags,
AArch64BuildAttrs::SubsectionType::ULEB128);
TS->emitAttribute(AArch64BuildAttrs::getVendorName(
AArch64BuildAttrs::AEABI_FEATURE_AND_BITS),
- AArch64BuildAttrs::TAG_FEATURE_BTI, BTIValue, "", false);
+ AArch64BuildAttrs::TAG_FEATURE_BTI, BTIValue, "");
TS->emitAttribute(AArch64BuildAttrs::getVendorName(
AArch64BuildAttrs::AEABI_FEATURE_AND_BITS),
- AArch64BuildAttrs::TAG_FEATURE_PAC, PACValue, "", false);
+ AArch64BuildAttrs::TAG_FEATURE_PAC, PACValue, "");
TS->emitAttribute(AArch64BuildAttrs::getVendorName(
AArch64BuildAttrs::AEABI_FEATURE_AND_BITS),
- AArch64BuildAttrs::TAG_FEATURE_GCS, GCSValue, "", false);
+ AArch64BuildAttrs::TAG_FEATURE_GCS, GCSValue, "");
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d519bfc..b00aa11 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2055,8 +2055,9 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
const IntrinsicInst *I) const {
- if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
- return true;
+ assert(I->getIntrinsicID() ==
+ Intrinsic::experimental_vector_partial_reduce_add &&
+ "Unexpected intrinsic!");
if (EnablePartialReduceNodes)
return true;
@@ -16890,9 +16891,16 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
// mul(zext(i8), sext) can be transformed into smull(zext, sext) which
// performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
// most one extra extend step is needed and using tbl is not profitable.
+ // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
+ // udot instruction.
if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
- if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
+ if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
+ (match(SingleUser,
+ m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
+ m_Value(), m_Specific(I))) &&
+ !shouldExpandPartialReductionIntrinsic(
+ cast<IntrinsicInst>(SingleUser))))
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d45cd8e..32edfad 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4956,6 +4956,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
(Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
(isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
+ isREVMask(Mask, LT.second.getScalarSizeInBits(),
+ LT.second.getVectorNumElements(), 16) ||
+ isREVMask(Mask, LT.second.getScalarSizeInBits(),
+ LT.second.getVectorNumElements(), 32) ||
+ isREVMask(Mask, LT.second.getScalarSizeInBits(),
+ LT.second.getVectorNumElements(), 64) ||
// Check for non-zero lane splats
all_of(drop_begin(Mask),
[&Mask](int M) { return M < 0 || M == Mask[0]; })))
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 11fb746..6a973b0 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -7874,8 +7874,7 @@ bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) {
IsOptional = AArch64BuildAttrs::getOptionalID(Optionality);
if (AArch64BuildAttrs::OPTIONAL_NOT_FOUND == IsOptional) {
Error(Parser.getTok().getLoc(),
- AArch64BuildAttrs::getSubsectionOptionalUnknownError() + ": " +
- Optionality);
+ AArch64BuildAttrs::getSubsectionOptionalUnknownError());
return true;
}
if (SubsectionExists) {
@@ -7923,7 +7922,7 @@ bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) {
Type = AArch64BuildAttrs::getTypeID(Name);
if (AArch64BuildAttrs::TYPE_NOT_FOUND == Type) {
Error(Parser.getTok().getLoc(),
- AArch64BuildAttrs::getSubsectionTypeUnknownError() + ": " + Name);
+ AArch64BuildAttrs::getSubsectionTypeUnknownError());
return true;
}
if (SubsectionExists) {
@@ -7952,6 +7951,7 @@ bool AArch64AsmParser::parseDirectiveAeabiSubSectionHeader(SMLoc L) {
}
}
Parser.Lex();
+
// Parsing finished, check for trailing tokens.
if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) {
Error(Parser.getTok().getLoc(), "unexpected token for AArch64 build "
@@ -7990,14 +7990,18 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
StringRef TagStr = "";
unsigned Tag;
- if (Parser.getTok().is(AsmToken::Identifier)) {
+ if (Parser.getTok().is(AsmToken::Integer)) {
+ Tag = getTok().getIntVal();
+ } else if (Parser.getTok().is(AsmToken::Identifier)) {
TagStr = Parser.getTok().getIdentifier();
switch (ActiveSubsectionID) {
- default:
- assert(0 && "Subsection name error");
- break;
case AArch64BuildAttrs::VENDOR_UNKNOWN:
- // Private subsection, accept any tag.
+      // Tag was provided as an unrecognized string instead of an unsigned
+      // integer.
+      Error(Parser.getTok().getLoc(), "unrecognized Tag: '" + TagStr +
+                                          "'\nExcept for public subsections, "
+                                          "tags have to be unsigned integers.");
+ return true;
break;
case AArch64BuildAttrs::AEABI_PAUTHABI:
Tag = AArch64BuildAttrs::getPauthABITagsID(TagStr);
@@ -8018,8 +8022,6 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
}
break;
}
- } else if (Parser.getTok().is(AsmToken::Integer)) {
- Tag = getTok().getIntVal();
} else {
Error(Parser.getTok().getLoc(), "AArch64 build attributes tag not found");
return true;
@@ -8063,10 +8065,9 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
Error(Parser.getTok().getLoc(), "AArch64 build attributes value not found");
return true;
}
- // Check for possible unaccepted values for known tags (AEABI_PAUTHABI,
- // AEABI_FEATURE_AND_BITS)
- if (!(ActiveSubsectionID == AArch64BuildAttrs::VENDOR_UNKNOWN) &&
- TagStr != "") { // TagStr was a recognized string
+ // Check for possible unaccepted values for known tags
+ // (AEABI_FEATURE_AND_BITS)
+ if (ActiveSubsectionID == AArch64BuildAttrs::AEABI_FEATURE_AND_BITS) {
if (0 != ValueInt && 1 != ValueInt) {
Error(Parser.getTok().getLoc(),
"unknown AArch64 build attributes Value for Tag '" + TagStr +
@@ -8075,7 +8076,8 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
}
}
Parser.Lex();
- // Parsing finished, check for trailing tokens.
+
+ // Parsing finished. Check for trailing tokens.
if (Parser.getTok().isNot(llvm::AsmToken::EndOfStatement)) {
Error(Parser.getTok().getLoc(),
"unexpected token for AArch64 build attributes tag and value "
@@ -8084,13 +8086,11 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) {
}
if (unsigned(-1) != ValueInt) {
- getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, ValueInt, "",
- false);
+ getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, ValueInt, "");
}
-
if ("" != ValueStr) {
getTargetStreamer().emitAttribute(ActiveSubsectionName, Tag, unsigned(-1),
- ValueStr, false);
+ ValueStr);
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index af64fce..08cff5a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -152,7 +152,7 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
}
void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value,
- std::string String, bool Override) override {
+ std::string String) override {
// AArch64 build attributes for assembly attribute form:
// .aeabi_attribute tag, value
@@ -164,19 +164,15 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
unsigned VendorID = AArch64BuildAttrs::getVendorID(VendorName);
switch (VendorID) {
- default:
- assert(0 && "Subsection name error");
- break;
case AArch64BuildAttrs::VENDOR_UNKNOWN:
if (unsigned(-1) != Value) {
OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value;
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "",
- Override);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
}
if ("" != String) {
OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << String;
AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1),
- String, Override);
+ String);
}
break;
// Note: AEABI_FEATURE_AND_BITS takes only unsigned values
@@ -186,16 +182,14 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value;
// Keep the data structure consistent with the case of ELF emission
// (important for llvm-mc asm parsing)
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "",
- Override);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
break;
case AArch64BuildAttrs::TAG_FEATURE_BTI:
case AArch64BuildAttrs::TAG_FEATURE_GCS:
case AArch64BuildAttrs::TAG_FEATURE_PAC:
- OS << "\t.aeabi_attribute" << "\t"
- << AArch64BuildAttrs::getFeatureAndBitsTagsStr(Tag) << ", " << Value;
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "",
- Override);
+ OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value << "\t// "
+ << AArch64BuildAttrs::getFeatureAndBitsTagsStr(Tag);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
break;
}
break;
@@ -206,15 +200,13 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value;
// Keep the data structure consistent with the case of ELF emission
// (important for llvm-mc asm parsing)
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "",
- Override);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
break;
case AArch64BuildAttrs::TAG_PAUTH_PLATFORM:
case AArch64BuildAttrs::TAG_PAUTH_SCHEMA:
- OS << "\t.aeabi_attribute" << "\t"
- << AArch64BuildAttrs::getPauthABITagsStr(Tag) << ", " << Value;
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "",
- Override);
+ OS << "\t.aeabi_attribute" << "\t" << Tag << ", " << Value << "\t// "
+ << AArch64BuildAttrs::getPauthABITagsStr(Tag);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
break;
}
break;
@@ -241,8 +233,8 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
StringRef ParameterStr = getTypeStr(ParameterType);
switch (SubsectionID) {
- default: {
- // Treated as a private subsection
+ case AArch64BuildAttrs::VENDOR_UNKNOWN: {
+ // Private subsection
break;
}
case AArch64BuildAttrs::AEABI_PAUTHABI: {
@@ -431,13 +423,12 @@ void AArch64TargetELFStreamer::emitAtributesSubsection(
}
void AArch64TargetELFStreamer::emitAttribute(StringRef VendorName, unsigned Tag,
- unsigned Value, std::string String,
- bool Override) {
+ unsigned Value,
+ std::string String) {
if (unsigned(-1) != Value)
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "", Override);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, Value, "");
if ("" != String)
- AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), String,
- Override);
+ AArch64TargetStreamer::emitAttribute(VendorName, Tag, unsigned(-1), String);
}
void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 1ed4a81..028d919 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -15,6 +15,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -193,8 +194,7 @@ AArch64TargetStreamer::getAtributesSubsectionByName(StringRef Name) {
}
void AArch64TargetStreamer::emitAttribute(StringRef VendorName, unsigned Tag,
- unsigned Value, std::string String,
- bool Override) {
+ unsigned Value, std::string String) {
if (unsigned(-1) == Value && "" == String) {
assert(0 && "Arguments error");
@@ -214,22 +214,14 @@ void AArch64TargetStreamer::emitAttribute(StringRef VendorName, unsigned Tag,
return;
}
for (MCELFStreamer::AttributeItem &Item : SubSection.Content) {
+ // Tag already exists
if (Item.Tag == Tag) {
- if (!Override) {
- if ((unsigned(-1) != Value && Item.IntValue != Value) ||
- ("" != String && Item.StringValue != String)) {
- assert(0 &&
- "Can not add AArch64 build attribute: An attribute with "
- "the same tag and a different value already exists");
- return;
- } else {
- // Case Item.IntValue == Value, no need to emit twice
- assert(0 &&
- "AArch64 build attribute: An attribute with the same tag "
- "and a same value already exists");
- return;
- }
- }
+ Item.Type = unsigned(-1) != Value
+ ? MCELFStreamer::AttributeItem::NumericAttribute
+ : MCELFStreamer::AttributeItem::TextAttribute;
+ Item.IntValue = unsigned(-1) != Value ? Value : unsigned(-1);
+ Item.StringValue = unsigned(-1) != Value ? "" : String;
+ return;
}
}
if (unsigned(-1) != Value)
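With the Override parameter gone, re-emitting an existing tag simply replaces the stored item; a hedged usage sketch, where TS and VendorName are assumed in scope (an AArch64TargetStreamer and a vendor string, as at the call sites above):

    TS->emitAttribute(VendorName, /*Tag=*/1, /*Value=*/1, "");
    TS->emitAttribute(VendorName, /*Tag=*/1, /*Value=*/2, "");
    // The subsection now holds one NumericAttribute item with IntValue == 2;
    // the removed code would instead have asserted on the repeated tag.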
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 9183fb4..9cbb104 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -100,7 +100,7 @@ public:
AArch64BuildAttrs::SubsectionOptional IsOptional,
AArch64BuildAttrs::SubsectionType ParameterType);
virtual void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value,
- std::string String, bool Override);
+ std::string String);
void activateAtributesSubsection(StringRef VendorName);
std::unique_ptr<MCELFStreamer::AttributeSubSection>
getActiveAtributesSubsection();
@@ -127,7 +127,7 @@ private:
StringRef VendorName, AArch64BuildAttrs::SubsectionOptional IsOptional,
AArch64BuildAttrs::SubsectionType ParameterType) override;
void emitAttribute(StringRef VendorName, unsigned Tag, unsigned Value,
- std::string String, bool Override = false) override;
+ std::string String) override;
void emitInst(uint32_t Inst) override;
void emitDirectiveVariantPCS(MCSymbol *Symbol) override;
void finish() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 649deee..740e52f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -223,8 +223,9 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-static bool isRegisterSize(unsigned Size) {
- return Size % 32 == 0 && Size <= MaxRegisterSize;
+static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
+ return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
+ Size <= MaxRegisterSize;
}
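A minimal standalone sketch of the predicate above, with the subtarget query reduced to a bool for illustration (MaxRegisterSize assumed to be 1024, matching the surrounding file):

    constexpr unsigned MaxRegisterSize = 1024;
    bool isRegisterSizeSketch(bool UseRealTrue16Insts, unsigned Size) {
      return ((UseRealTrue16Insts && Size == 16) || Size % 32 == 0) &&
             Size <= MaxRegisterSize;
    }
    // isRegisterSizeSketch(true, 16)  -> true: s16 becomes a legal register size
    // isRegisterSizeSketch(false, 16) -> false: the old behavior is preserved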
static bool isRegisterVectorElementType(LLT EltTy) {
@@ -240,8 +241,8 @@ static bool isRegisterVectorType(LLT Ty) {
}
// TODO: replace all uses of isRegisterType with isRegisterClassType
-static bool isRegisterType(LLT Ty) {
- if (!isRegisterSize(Ty.getSizeInBits()))
+static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
+ if (!isRegisterSize(ST, Ty.getSizeInBits()))
return false;
if (Ty.isVector())
@@ -252,19 +253,21 @@ static bool isRegisterType(LLT Ty) {
// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
-static LegalityPredicate isRegisterType(unsigned TypeIdx) {
- return [=](const LegalityQuery &Query) {
- return isRegisterType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
+ return [=, &ST](const LegalityQuery &Query) {
+ return isRegisterType(ST, Query.Types[TypeIdx]);
};
}
// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
-static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
- return [=](const LegalityQuery &Query) {
+static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
+ return [=, &ST](const LegalityQuery &Query) {
LLT Ty = Query.Types[TypeIdx];
- return isRegisterType(Ty) &&
+ return isRegisterType(ST, Ty) &&
!SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
};
}
@@ -348,17 +351,20 @@ static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
V6S64, V7S64, V8S64, V16S64};
// Checks whether a type is in the list of legal register types.
-static bool isRegisterClassType(LLT Ty) {
+static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
if (Ty.isPointerOrPointerVector())
Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
- is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
+ is_contained(AllScalarTypes, Ty) ||
+ (ST.useRealTrue16Insts() && Ty == S16) ||
+ is_contained(AllS16Vectors, Ty);
}
-static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
- return [TypeIdx](const LegalityQuery &Query) {
- return isRegisterClassType(Query.Types[TypeIdx]);
+static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
+ unsigned TypeIdx) {
+ return [&ST, TypeIdx](const LegalityQuery &Query) {
+ return isRegisterClassType(ST, Query.Types[TypeIdx]);
};
}
@@ -510,7 +516,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
- return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
+ return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
!hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}
@@ -523,12 +529,12 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
if (Size != MemSizeInBits)
return Size <= 32 && Ty.isVector();
- if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
return true;
// Don't try to handle bitcasting vector ext loads for now.
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
- (Size <= 32 || isRegisterSize(Size)) &&
+ (Size <= 32 || isRegisterSize(ST, Size)) &&
!isRegisterVectorElementType(Ty.getElementType());
}
@@ -875,7 +881,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
- .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
+ .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
.lower();
getActionDefinitionsBuilder(G_CONSTANT)
@@ -890,7 +896,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S16, S64);
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
- .legalIf(isRegisterClassType(0))
+ .legalIf(isRegisterClassType(ST, 0))
// s1 and s16 are special cases because they have legal operations on
// them, but don't really occupy registers in the normal way.
.legalFor({S1, S16})
@@ -1779,7 +1785,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
unsigned IdxTypeIdx = 2;
getActionDefinitionsBuilder(Op)
- .customIf([=](const LegalityQuery &Query) {
+ .customIf([=](const LegalityQuery &Query) {
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
@@ -1800,36 +1806,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
IdxTy.getSizeInBits() == 32 &&
isLegalVecType;
})
- .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
- bitcastToVectorElement32(VecTypeIdx))
- //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
- .bitcastIf(
- all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
- [=](const LegalityQuery &Query) {
- // For > 64-bit element types, try to turn this into a 64-bit
- // element vector since we may be able to do better indexing
- // if this is scalar. If not, fall back to 32.
- const LLT EltTy = Query.Types[EltTypeIdx];
- const LLT VecTy = Query.Types[VecTypeIdx];
- const unsigned DstEltSize = EltTy.getSizeInBits();
- const unsigned VecSize = VecTy.getSizeInBits();
-
- const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
- return std::pair(
- VecTypeIdx,
- LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
- })
- .clampScalar(EltTypeIdx, S32, S64)
- .clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32)
- .clampMaxNumElements(VecTypeIdx, S32, 32)
- // TODO: Clamp elements for 64-bit vectors?
- .moreElementsIf(
- isIllegalRegisterType(VecTypeIdx),
- moreElementsToNextExistingRegClass(VecTypeIdx))
- // It should only be necessary with variable indexes.
- // As a last resort, lower to the stack
- .lower();
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+ scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+ bitcastToVectorElement32(VecTypeIdx))
+ //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
+ scalarOrEltWiderThan(VecTypeIdx, 64)),
+ [=](const LegalityQuery &Query) {
+ // For > 64-bit element types, try to turn this into a
+ // 64-bit element vector since we may be able to do better
+ // indexing if this is scalar. If not, fall back to 32.
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const unsigned DstEltSize = EltTy.getSizeInBits();
+ const unsigned VecSize = VecTy.getSizeInBits();
+
+ const unsigned TargetEltSize =
+ DstEltSize % 64 == 0 ? 64 : 32;
+ return std::pair(VecTypeIdx,
+ LLT::fixed_vector(VecSize / TargetEltSize,
+ TargetEltSize));
+ })
+ .clampScalar(EltTypeIdx, S32, S64)
+ .clampScalar(VecTypeIdx, S32, S64)
+ .clampScalar(IdxTypeIdx, S32, S32)
+ .clampMaxNumElements(VecTypeIdx, S32, 32)
+ // TODO: Clamp elements for 64-bit vectors?
+ .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
+ moreElementsToNextExistingRegClass(VecTypeIdx))
+ // It should only be necessary with variable indexes.
+ // As a last resort, lower to the stack
+ .lower();
}
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -1876,15 +1883,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
- auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V32S32)
- .clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
- .moreElementsIf(
- isIllegalRegisterType(0),
- moreElementsToNextExistingRegClass(0));
+ auto &BuildVector =
+ getActionDefinitionsBuilder(G_BUILD_VECTOR)
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V32S32)
+ .clampNumElements(0, V2S64, V16S64)
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .moreElementsIf(isIllegalRegisterType(ST, 0),
+ moreElementsToNextExistingRegClass(0));
if (ST.hasScalarPackInsts()) {
BuildVector
@@ -1904,14 +1911,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
}
- BuildVector.legalIf(isRegisterType(0));
+ BuildVector.legalIf(isRegisterType(ST, 0));
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
- .clampMaxNumElements(0, S32, 32)
- .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
- .clampMaxNumElements(0, S16, 64);
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+ .clampMaxNumElements(0, S32, 32)
+ .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, S16, 64);
getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
@@ -1932,34 +1939,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
- auto &Builder = getActionDefinitionsBuilder(Op)
- .legalIf(all(isRegisterType(0), isRegisterType(1)))
- .lowerFor({{S16, V2S16}})
- .lowerIf([=](const LegalityQuery &Query) {
- const LLT BigTy = Query.Types[BigTyIdx];
- return BigTy.getSizeInBits() == 32;
- })
- // Try to widen to s16 first for small types.
- // TODO: Only do this on targets with legal s16 shifts
- .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
- .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
- .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
- elementTypeIs(1, S16)),
- changeTo(1, V2S16))
- // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
- // worth considering the multiples of 64 since 2*192 and 2*384 are not
- // valid.
- .clampScalar(LitTyIdx, S32, S512)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
- // Break up vectors with weird elements into scalars
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
- scalarize(0))
- .fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
- scalarize(1))
- .clampScalar(BigTyIdx, S32, MaxScalar);
+ auto &Builder =
+ getActionDefinitionsBuilder(Op)
+ .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
+ .lowerFor({{S16, V2S16}})
+ .lowerIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return BigTy.getSizeInBits() == 32;
+ })
+ // Try to widen to s16 first for small types.
+ // TODO: Only do this on targets with legal s16 shifts
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
+ .moreElementsIf(isSmallOddVector(BigTyIdx),
+ oneMoreElement(BigTyIdx))
+ .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
+ elementTypeIs(1, S16)),
+ changeTo(1, V2S16))
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's
+ // not worth considering the multiples of 64 since 2*192 and 2*384
+ // are not valid.
+ .clampScalar(LitTyIdx, S32, S512)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
+ // Break up vectors with weird elements into scalars
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) {
+ return notValidElt(Query, LitTyIdx);
+ },
+ scalarize(0))
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) {
+ return notValidElt(Query, BigTyIdx);
+ },
+ scalarize(1))
+ .clampScalar(BigTyIdx, S32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
@@ -3146,7 +3159,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
} else {
// Extract the subvector.
- if (isRegisterType(ValTy)) {
+ if (isRegisterType(ST, ValTy)) {
// If this a case where G_EXTRACT is legal, use it.
// (e.g. <3 x s32> -> <4 x s32>)
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
@@ -7400,13 +7413,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
// TODO: Could insert G_ASSERT_ZEXT from s16
case Intrinsic::r600_read_local_size_z:
- return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
- case Intrinsic::r600_read_global_size_x:
- return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
- case Intrinsic::r600_read_global_size_y:
- return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
- case Intrinsic::r600_read_global_size_z:
- return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
+ return legalizeKernargMemParameter(MI, B,
+ SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_fdiv_fast:
return legalizeFDIVFastIntrin(MI, MRI, B);
case Intrinsic::amdgcn_is_shared:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 2d8dc9d..1c1a6da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
+ [VGPR_16_Lo128, VGPR_16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ed0925..be7cdde 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8770,27 +8770,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Z, Align(4),
false);
- case Intrinsic::r600_read_global_size_x:
- if (Subtarget->isAmdHsaOS())
- return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X,
- Align(4), false);
- case Intrinsic::r600_read_global_size_y:
- if (Subtarget->isAmdHsaOS())
- return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y,
- Align(4), false);
- case Intrinsic::r600_read_global_size_z:
- if (Subtarget->isAmdHsaOS())
- return emitNonHSAIntrinsicError(DAG, DL, VT);
-
- return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z,
- Align(4), false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee..6f80dbc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
let isMoveImm = 1;
let SchedRW = [Write64Bit];
let Size = 4;
+ let VOP1 = 1; // Not entirely correct, but close enough.
let UseNamedOperandTable = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 924aa45..5a07887 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -35,7 +35,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
@@ -351,9 +351,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
static auto InitializeRegSplitPartsOnce = [this]() {
for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
unsigned Size = getSubRegIdxSize(Idx);
- if (Size & 31)
+ if (Size & 15)
continue;
- std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+ std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
unsigned Pos = getSubRegIdxOffset(Idx);
if (Pos % Size)
continue;
@@ -3554,14 +3554,14 @@ bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
- assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
+ assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
- const unsigned RegDWORDs = RegBitWidth / 32;
- const unsigned EltDWORDs = EltSize / 4;
- assert(RegSplitParts.size() + 1 >= EltDWORDs);
+ const unsigned RegHalves = RegBitWidth / 16;
+ const unsigned EltHalves = EltSize / 2;
+ assert(RegSplitParts.size() + 1 >= EltHalves);
- const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
- const unsigned NumParts = RegDWORDs / EltDWORDs;
+ const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
+ const unsigned NumParts = RegHalves / EltHalves;
return ArrayRef(Parts.data(), NumParts);
}
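A worked example of the new 16-bit-granular split math above, as a standalone sketch; the lambda mirrors RegHalves / EltHalves from getRegSplitParts:

    #include <cassert>
    int main() {
      auto numParts = [](unsigned RegBitWidth, unsigned EltSizeBytes) {
        return (RegBitWidth / 16) / (EltSizeBytes / 2);
      };
      assert(numParts(64, 2) == 4); // VReg_64: four 16-bit parts (new)
      assert(numParts(64, 4) == 2); // DWORD elements: same answer as the old /32 math
      return 0;
    }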
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a434efb..a64180d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -37,11 +37,11 @@ private:
BitVector RegPressureIgnoredUnits;
/// Sub reg indexes for getRegSplitParts.
- /// First index represents subreg size from 1 to 16 DWORDs.
+  /// First index represents subreg size from 1 to 32 half-DWORDs (16-bit units).
/// The inner vector is sorted by bit offset.
/// Provided a register can be fully split with given subregs,
/// all elements of the inner vector combined give a full lane mask.
- static std::array<std::vector<int16_t>, 16> RegSplitParts;
+ static std::array<std::vector<int16_t>, 32> RegSplitParts;
// Table representing sub reg of given width and offset.
// First index is subreg size: 32, 64, 96, 128, 160, 192, 224, 256, 512.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0d..6a92e54 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2483,6 +2483,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_16RegClassID:
+ case AMDGPU::VGPR_16_Lo128RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 83cc4b1..efcd997 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -770,8 +770,20 @@ public:
continue;
Intrinsic::ID ID = F.getIntrinsicID();
switch (ID) {
- default:
+    // NOTE: Skip dx_resource_casthandle here; these casts are
+    // resolved after this loop in cleanupHandleCasts.
+ case Intrinsic::dx_resource_casthandle:
+ // NOTE: llvm.dbg.value is supported as is in DXIL.
+ case Intrinsic::dbg_value:
+ case Intrinsic::not_intrinsic:
continue;
+ default: {
+ DiagnosticInfoUnsupported Diag(
+ F, "Unsupported intrinsic for DXIL lowering");
+ M.getContext().diagnose(Diag);
+ HasErrors |= true;
+ break;
+ }
#define DXIL_OP_INTRINSIC(OpCode, Intrin, ...) \
case Intrin: \
HasErrors |= replaceFunctionWithOp( \
diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index d0580b9..6a48af5 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -94,6 +94,7 @@ BT::BitMask HexagonEvaluator::mask(Register Reg, unsigned Sub) const {
bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo));
switch (ID) {
case Hexagon::DoubleRegsRegClassID:
+ case Hexagon::DoubleRegs_with_isub_hi_in_IntRegsLow8RegClassID:
case Hexagon::HvxWRRegClassID:
case Hexagon::HvxVQRRegClassID:
return IsSubLo ? BT::BitMask(0, RW-1)
@@ -139,6 +140,7 @@ const TargetRegisterClass &HexagonEvaluator::composeWithSubRegIndex(
switch (RC.getID()) {
case Hexagon::DoubleRegsRegClassID:
+ case Hexagon::DoubleRegs_with_isub_hi_in_IntRegsLow8RegClassID:
return Hexagon::IntRegsRegClass;
case Hexagon::HvxWRRegClassID:
return Hexagon::HvxVRRegClass;
diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
index cc41b56..2378bbc 100644
--- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -65,6 +65,8 @@ def CC_Hexagon: CallingConv<[
CCIfType<[i32],
CCIfSplit<
CCCustom<"CC_SkipOdd">>>,
+ CCIfType<[v4i1], CCPromoteToType<v4i16>>,
+ CCIfType<[v8i1], CCPromoteToType<v8i8>>,
CCIfType<[i32,v2i16,v4i8],
CCAssignToReg<[R0,R1,R2,R3,R4,R5]>>,
@@ -111,6 +113,14 @@ class CCIfHvx128<CCAction A>
def CC_Hexagon_HVX: CallingConv<[
// HVX 64-byte mode
+
+ CCIfHvx64<
+ CCIfType<[v16i1], CCPromoteToType<v16i32>>>,
+ CCIfHvx64<
+ CCIfType<[v32i1], CCPromoteToType<v32i16>>>,
+ CCIfHvx64<
+ CCIfType<[v64i1], CCPromoteToType<v64i8>>>,
+
CCIfHvx64<
CCIfType<[v16i32,v32i16,v64i8],
CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
@@ -125,6 +135,14 @@ def CC_Hexagon_HVX: CallingConv<[
CCAssignToStack<128,64>>>,
// HVX 128-byte mode
+
+ CCIfHvx128<
+ CCIfType<[v32i1], CCPromoteToType<v32i32>>>,
+ CCIfHvx128<
+ CCIfType<[v64i1], CCPromoteToType<v64i16>>>,
+ CCIfHvx128<
+ CCIfType<[v128i1], CCPromoteToType<v128i8>>>,
+
CCIfHvx128<
CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16],
CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
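For instance, under the promotions just added, a predicate vector argument is widened to an equally sized integer vector before register assignment (a sketch of the sizes, not quoted from a test):

    // v8i1   (base ISA) -> v8i8  : 64 bits, one doubleword argument slot
    // v64i1  (HVX-64)   -> v64i8 : 64 bytes, one full vector register V0..V15
    // v128i1 (HVX-128)  -> v128i8: the 128-byte mode mirrors the same rule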
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6f0bf51..971a128 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
}
+ if (!Opcode)
+ return false;
+ Ops.append({Base, Offset, Chain});
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
std::optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
- SDValue Base, Offset, Addr;
+ SDValue Base, Offset;
- if (SelectDirectAddr(Op1, Addr)) {
+ if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case ISD::LOAD:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
break;
case ISD::INTRINSIC_W_CHAIN:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
break;
case NVPTXISD::LDUV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
break;
case NVPTXISD::LDUV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- SDValue Ops[] = {Base, Offset, Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
}
+ if (!Opcode)
+ return false;
+ SDValue Ops[] = {Base, Offset, Chain};
+ LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
// For automatic generation of LDG (through SelectLoad[Vector], not the
// intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
break;
}
- Ops.append({Base, Offset});
} else {
if (PointerSize == 64) {
SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
- Ops.append({Base, Offset});
}
if (!Opcode)
return false;
-
- Ops.push_back(Chain);
+ Ops.append({Base, Offset, Chain});
ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c98d8a3..3373f9e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2718,23 +2718,23 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
// Scalar
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
}
-defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
// vector
@@ -2742,56 +2742,40 @@ defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
-}
-
-defm INT_PTX_LDU_G_v2i8_ELE
- : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i16_ELE
- : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i32_ELE
- : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDU_G_v2f32_ELE
- : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDU_G_v2i64_ELE
- : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDU_G_v2f64_ELE
- : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDU_G_v4i8_ELE
- : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v4i16_ELE
- : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4i32_ELE
- : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f16_ELE
- : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4f16x2_ELE
- : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f32_ELE
- : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Float32Regs>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
//-----------------------------------
@@ -2803,29 +2787,23 @@ defm INT_PTX_LDU_G_v4f32_ELE
// during the lifetime of the kernel.
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
}
-defm INT_PTX_LDG_GLOBAL_i8
- : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i16
- : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDG_GLOBAL_i64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDG_GLOBAL_f32
- : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDG_GLOBAL_f64
- : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
// vector
@@ -2833,54 +2811,39 @@ defm INT_PTX_LDG_GLOBAL_f64
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
- def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int32Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int64Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
}
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-defm INT_PTX_LDG_G_v2i8_ELE
- : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i16_ELE
- : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i32_ELE
- : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v2f32_ELE
- : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDG_G_v2i64_ELE
- : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDG_G_v2f64_ELE
- : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDG_G_v4i8_ELE
- : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i16_ELE
- : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i32_ELE
- : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v4f32_ELE
- : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
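The new _asi records pair a symbolic address with an immediate byte offset; an illustrative expansion (the printed PTX is inferred from the format strings above, and the operand names are the patch's own):

    // INT_PTX_LDG_GLOBAL_i32asi with $src = @gv and $offset = +8 prints as:
    //   ld.global.nc.u32  %r1, [gv+8];
    // The deleted avar variants could only address [gv] with no offset.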
multiclass NG_TO_G<string Str> {
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 8c07d87..1025b57 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -328,6 +328,19 @@ static DecodeStatus decodeUImmOperand(MCInst &Inst, uint32_t Imm,
return MCDisassembler::Success;
}
+template <unsigned Width, unsigned LowerBound>
+static DecodeStatus decodeUImmOperandGE(MCInst &Inst, uint32_t Imm,
+ int64_t Address,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<Width>(Imm) && "Invalid immediate");
+
+ if (Imm < LowerBound)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
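Instantiated later for uimm5gt3 as decodeUImmOperandGE<5, 4>, the decoder rejects exactly the encodings the ImmLeaf predicate (Imm > 3) excludes:

    // decodeUImmOperandGE<5, 4>:
    //   Imm in [0, 3]  -> MCDisassembler::Fail
    //   Imm in [4, 31] -> operand appended, MCDisassembler::Success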
+
static DecodeStatus decodeUImmLog2XLenOperand(MCInst &Inst, uint32_t Imm,
int64_t Address,
const MCDisassembler *Decoder) {
@@ -606,6 +619,17 @@ void RISCVDisassembler::addSPOperands(MCInst &MI) const {
(void)nullptr)
#define TRY_TO_DECODE_FEATURE(FEATURE, DECODER_TABLE, DESC) \
TRY_TO_DECODE(STI.hasFeature(FEATURE), DECODER_TABLE, DESC)
+#define TRY_TO_DECODE_FEATURE_ANY(FEATURES, DECODER_TABLE, DESC) \
+ TRY_TO_DECODE((STI.getFeatureBits() & (FEATURES)).any(), DECODER_TABLE, DESC)
+
+static constexpr FeatureBitset XqciFeatureGroup = {
+ RISCV::FeatureVendorXqcia, RISCV::FeatureVendorXqciac,
+ RISCV::FeatureVendorXqcicli, RISCV::FeatureVendorXqcicm,
+ RISCV::FeatureVendorXqcics, RISCV::FeatureVendorXqcicsr,
+ RISCV::FeatureVendorXqciint, RISCV::FeatureVendorXqcilia,
+ RISCV::FeatureVendorXqcilo, RISCV::FeatureVendorXqcilsm,
+ RISCV::FeatureVendorXqcisls,
+};
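One bitset test now stands in for the chain of per-extension probes; a sketch of what the macro use below expands to:

    // TRY_TO_DECODE_FEATURE_ANY(XqciFeatureGroup, DecoderTableXqci32, ...)
    //   ==> if ((STI.getFeatureBits() & XqciFeatureGroup).any())
    //         attempt DecoderTableXqci32;
    // The merged Xqci table is consulted once when any of the eleven
    // Xqci sub-extensions is enabled.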
DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
@@ -693,24 +717,10 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
"CORE-V SIMD extensions");
TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbi, DecoderTableXCVbi32,
"CORE-V Immediate Branching");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicsr, DecoderTableXqcicsr32,
- "Qualcomm uC CSR");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcisls, DecoderTableXqcisls32,
- "Qualcomm uC Scaled Load Store");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcia, DecoderTableXqcia32,
- "Qualcomm uC Arithmetic");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcics, DecoderTableXqcics32,
- "Qualcomm uC Conditional Select");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcilsm, DecoderTableXqcilsm32,
- "Qualcomm uC Load Store Multiple");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciac, DecoderTableXqciac32,
- "Qualcomm uC Load-Store Address Calculation");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32,
- "Qualcomm uC Conditional Load Immediate");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32,
- "Qualcomm uC Conditional Move");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint32,
- "Qualcomm uC Interrupts");
+
+ TRY_TO_DECODE_FEATURE_ANY(XqciFeatureGroup, DecoderTableXqci32,
+ "Qualcomm uC Extensions");
+
TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXRivosVizip, DecoderTableXRivos32,
"Rivos");
@@ -739,12 +749,10 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
"Zcmt (16-bit Table Jump Instructions)");
TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZcmp, DecoderTableRVZcmp16,
"Zcmp (16-bit Push/Pop & Double Move Instructions)");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciac, DecoderTableXqciac16,
- "Qualcomm uC Load-Store Address Calculation 16bit");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16,
- "Qualcomm uC Conditional Move 16bit");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint16,
- "Qualcomm uC Interrupts 16bit");
+
+ TRY_TO_DECODE_FEATURE_ANY(XqciFeatureGroup, DecoderTableXqci16,
+ "Qualcomm uC 16bit");
+
TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc),
DecoderTableXwchc16, "WCH QingKe XW");
TRY_TO_DECODE_AND_ADD_SP(true, DecoderTable16,
@@ -767,10 +775,8 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size,
for (size_t i = Size; i-- != 0;) {
Insn += (static_cast<uint64_t>(Bytes[i]) << 8 * i);
}
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcilia, DecoderTableXqcilia48,
- "Qualcomm uC Large Immediate Arithmetic 48bit");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcilo, DecoderTableXqcilo48,
- "Qualcomm uC Large Offset Load Store 48bit");
+ TRY_TO_DECODE_FEATURE_ANY(XqciFeatureGroup, DecoderTableXqci48,
+ "Qualcomm uC 48bit");
return MCDisassembler::Fail;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index a962e64..c775561 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -2033,6 +2033,12 @@ def : PatGprImm<binop_allwusers<and>, ANDI, u32simm12>;
def : PatGprImm<binop_allwusers<or>, ORI, u32simm12>;
def : PatGprImm<binop_allwusers<xor>, XORI, u32simm12>;
+// Select 'or' as ADDIW if the immediate's set bits are known to be 0 in $rs1
+// and $rs1 is sign-extended. This can improve compressibility. Using ADDIW
+// gives more power to RISCVOptWInstrs.
+def : Pat<(or_is_add 33signbits_node:$rs1, simm12:$imm),
+ (ADDIW $rs1, simm12:$imm)>;
+
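A hedged example of when the new pattern applies:

    // With %x sign-extended from 32 bits (>= 33 sign bits) and bits [2:0]
    // known zero, 'or' and 'add' agree on the result:
    //   (or %x, 7)  ==  (add %x, 7)  ->  addiw rd, x, 7
    // ADDIW compresses to c.addiw when rd == rs1, and stays visible to
    // RISCVOptWInstrs.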
/// Loads
def : LdPat<sextloadi32, LW, i64>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 873fa15..4890c3b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -18,10 +18,10 @@
let Predicates = [HasVendorXRivosVizip], DecoderNamespace = "XRivos",
Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather,
Inst<6-0> = OPC_CUSTOM_2.Value in {
-defm RV_VZIPEVEN_V : VALU_IV_V<"rv.vzipeven", 0b001100>;
-defm RV_VZIPODD_V : VALU_IV_V<"rv.vzipodd", 0b011100>;
-defm RV_VZIP2A_V : VALU_IV_V<"rv.vzip2a", 0b000100>;
-defm RV_VZIP2B_V : VALU_IV_V<"rv.vzip2b", 0b010100>;
-defm RV_VUNZIP2A_V : VALU_IV_V<"rv.vunzip2a", 0b001000>;
-defm RV_VUNZIP2B_V : VALU_IV_V<"rv.vunzip2b", 0b011000>;
+defm RI_VZIPEVEN_V : VALU_IV_V<"ri.vzipeven", 0b001100>;
+defm RI_VZIPODD_V : VALU_IV_V<"ri.vzipodd", 0b011100>;
+defm RI_VZIP2A_V : VALU_IV_V<"ri.vzip2a", 0b000100>;
+defm RI_VZIP2B_V : VALU_IV_V<"ri.vzip2b", 0b010100>;
+defm RI_VUNZIP2A_V : VALU_IV_V<"ri.vunzip2a", 0b001000>;
+defm RI_VUNZIP2B_V : VALU_IV_V<"ri.vunzip2b", 0b011000>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 3a8039f..b1283d0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -24,7 +24,7 @@ def uimm5nonzero : RISCVOp<XLenVT>,
def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
[{return (Imm > 3) && isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5, "GT3">;
- let DecoderMethod = "decodeUImmOperand<5>";
+ let DecoderMethod = "decodeUImmOperandGE<5, 4>";
let OperandType = "OPERAND_UIMM5_GT3";
}
@@ -283,7 +283,9 @@ class QCIRVInstEI<bits<3> funct3, bits<2> funct2, string opcodestr>
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVendorXqcicsr, IsRV32], DecoderNamespace = "Xqcicsr" in {
+let DecoderNamespace = "Xqci" in {
+
+let Predicates = [HasVendorXqcicsr, IsRV32] in {
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
def QC_CSRRWR : RVInstR<0b1000110, 0b000, OPC_SYSTEM, (outs GPR:$rd),
(ins GPR:$rs1, GPRNoX0:$rs2), "qc.csrrwr",
@@ -293,9 +295,9 @@ let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
(ins uimm5:$rs1, GPRNoX0:$rs2), "qc.csrrwri",
"$rd, $rs1, $rs2">;
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
-} // Predicates = [HasVendorXqcicsr, IsRV32], DecoderNamespace = "Xqcicsr"
+} // Predicates = [HasVendorXqcicsr, IsRV32]
-let Predicates = [HasVendorXqcisls, IsRV32], DecoderNamespace = "Xqcisls" in {
+let Predicates = [HasVendorXqcisls, IsRV32] in {
def QC_LRB : QCILoad_ScaleIdx<0b1000, "qc.lrb">;
def QC_LRH : QCILoad_ScaleIdx<0b1001, "qc.lrh">;
def QC_LRW : QCILoad_ScaleIdx<0b1010, "qc.lrw">;
@@ -305,9 +307,9 @@ let Predicates = [HasVendorXqcisls, IsRV32], DecoderNamespace = "Xqcisls" in {
def QC_SRB : QCIStore_ScaleIdx<0b1101, "qc.srb">;
def QC_SRH : QCIStore_ScaleIdx<0b1110, "qc.srh">;
def QC_SRW : QCIStore_ScaleIdx<0b1111, "qc.srw">;
-} // Predicates = [HasVendorXqcisls, IsRV32], DecoderNamespace = "Xqcisls"
+} // Predicates = [HasVendorXqcisls, IsRV32]
-let Predicates = [HasVendorXqcia, IsRV32], DecoderNamespace = "Xqcia" in {
+let Predicates = [HasVendorXqcia, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_SLASAT : QCIRVInstRR<0b01010, GPRNoX0, "qc.slasat">;
def QC_SLLSAT : QCIRVInstRR<0b01100, GPRNoX0, "qc.sllsat">;
@@ -329,9 +331,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_NORMU : QCIRVInstR<0b1000, "qc.normu">;
def QC_NORMEU : QCIRVInstR<0b1001, "qc.normeu">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
-} // Predicates = [HasVendorXqcia, IsRV32], DecoderNamespace = "Xqcia"
+} // Predicates = [HasVendorXqcia, IsRV32]
-let Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac" in {
+let Predicates = [HasVendorXqciac, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_C_MULIADD : RVInst16CL<0b001, 0b10, (outs GPRC:$rd_wb),
(ins GPRC:$rd, GPRC:$rs1, uimm5:$uimm),
@@ -360,9 +362,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
-} // Predicates = [HasVendorXqciac, IsRV32], DecoderNamespace = "Xqciac"
+} // Predicates = [HasVendorXqciac, IsRV32]
-let Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" in {
+let Predicates = [HasVendorXqcics, IsRV32] in {
def QC_SELECTIIEQ : QCISELECTIICC <0b010, "qc.selectiieq">;
def QC_SELECTIINE : QCISELECTIICC <0b011, "qc.selectiine">;
def QC_SELECTIEQ : QCISELECTICC <0b010, "qc.selectieq">;
@@ -371,9 +373,9 @@ let Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics" in {
def QC_SELECTNEI : QCISELECTCCI <0b011, "qc.selectnei">;
def QC_SELECTIEQI : QCISELECTICCI <0b010, "qc.selectieqi">;
def QC_SELECTINEI : QCISELECTICCI <0b011, "qc.selectinei">;
-} // Predicates = [HasVendorXqcics, IsRV32], DecoderNamespace = "Xqcics"
+} // Predicates = [HasVendorXqcics, IsRV32]
-let Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" in {
+let Predicates = [HasVendorXqcilsm, IsRV32] in {
def QC_SWM : QCIStoreMultiple<0b00, GPRNoX0, "qc.swm">;
def QC_SWMI : QCIStoreMultiple<0b01, uimm5nonzero, "qc.swmi">;
def QC_SETWM : QCIStoreMultiple<0b10, GPRNoX0, "qc.setwm">;
@@ -381,9 +383,9 @@ let Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm" in {
def QC_LWM : QCILoadMultiple<0b00, GPRNoX0, "qc.lwm">;
def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">;
-} // Predicates = [HasVendorXqcilsm, IsRV32], DecoderNamespace = "Xqcilsm"
+} // Predicates = [HasVendorXqcilsm, IsRV32]
-let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in {
+let Predicates = [HasVendorXqcicli, IsRV32] in {
def QC_LIEQ : QCILICC<0b000, 0b01, GPRNoX0, "qc.lieq">;
def QC_LINE : QCILICC<0b001, 0b01, GPRNoX0, "qc.line">;
def QC_LILT : QCILICC<0b100, 0b01, GPRNoX0, "qc.lilt">;
@@ -397,9 +399,9 @@ let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in {
def QC_LIGEI : QCILICC<0b101, 0b11, simm5, "qc.ligei">;
def QC_LILTUI : QCILICC<0b110, 0b11, uimm5, "qc.liltui">;
def QC_LIGEUI : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">;
-} // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli"
+} // Predicates = [HasVendorXqcicli, IsRV32]
-let Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" in {
+let Predicates = [HasVendorXqcicm, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def QC_C_MVEQZ : RVInst16CL<0b101, 0b10, (outs GPRC:$rd_wb),
(ins GPRC:$rd, GPRC:$rs1),
@@ -423,9 +425,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def QC_MVGEI : QCIMVCCI<0b101, "qc.mvgei", simm5>;
def QC_MVLTUI : QCIMVCCI<0b110, "qc.mvltui", uimm5>;
def QC_MVGEUI : QCIMVCCI<0b111, "qc.mvgeui", uimm5>;
-} // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm"
+} // Predicates = [HasVendorXqcicm, IsRV32]
-let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in {
+let Predicates = [HasVendorXqciint, IsRV32] in {
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
def QC_C_DIR : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd), (ins),
"qc.c.dir", "$rd"> {
@@ -455,9 +457,9 @@ let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in {
let mayLoad = 1, mayStore = 1, isReturn = 1, isTerminator = 1 in
def QC_C_MILEAVERET : QCIRVInst16CI_NONE<0b10100, "qc.c.mileaveret">;
-} // Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint"
+} // Predicates = [HasVendorXqciint, IsRV32]
-let Predicates = [HasVendorXqcilo, IsRV32], DecoderNamespace = "Xqcilo" in {
+let Predicates = [HasVendorXqcilo, IsRV32] in {
def QC_E_LB : QCIRVInstEILoad<0b101, 0b00, "qc.e.lb">;
def QC_E_LBU : QCIRVInstEILoad<0b101, 0b01, "qc.e.lbu">;
def QC_E_LH : QCIRVInstEILoad<0b101, 0b10, "qc.e.lh">;
@@ -467,9 +469,9 @@ let Predicates = [HasVendorXqcilo, IsRV32], DecoderNamespace = "Xqcilo" in {
def QC_E_SB : QCIRVInstESStore<0b110, 0b01, "qc.e.sb">;
def QC_E_SH : QCIRVInstESStore<0b110, 0b10, "qc.e.sh">;
def QC_E_SW : QCIRVInstESStore<0b110, 0b11, "qc.e.sw">;
-} // Predicates = [HasVendorXqcilo, IsRV32], DecoderNamespace = "Xqcilo"
+} // Predicates = [HasVendorXqcilo, IsRV32]
-let Predicates = [HasVendorXqcilia, IsRV32], DecoderNamespace = "Xqcilia" in {
+let Predicates = [HasVendorXqcilia, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_E_XORAI : QCIRVInstEAI<0b001, 0b0, "qc.e.xorai">;
def QC_E_ORAI : QCIRVInstEAI<0b001, 0b1, "qc.e.orai" >;
@@ -481,7 +483,9 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
def QC_E_ADDI : QCIRVInstEI<0b011, 0b10, "qc.e.addi">;
def QC_E_ANDI : QCIRVInstEI<0b011, 0b11, "qc.e.andi">;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
-} // Predicates = [HasVendorXqcilia, IsRV32], DecoderNamespace = "Xqcilia"
+} // Predicates = [HasVendorXqcilia, IsRV32]
+
+} // DecoderNamespace = "Xqci"
//===----------------------------------------------------------------------===//
// Aliases
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index 9dfbcf6..1740ebb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -31,7 +31,7 @@ def uimm2_lsb0 : RISCVOp,
def uimm8ge32 : RISCVOp {
let ParserMatchClass = UImmAsmOperand<8, "GE32">;
- let DecoderMethod = "decodeUImmOperand<8>";
+ let DecoderMethod = "decodeUImmOperandGE<8, 32>";
let OperandType = "OPERAND_UIMM8_GE32";
}
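
Note: both operand classes that switch to decodeUImmOperandGE — uimm5gt3 in RISCVInstrInfoXqci.td and uimm8ge32 here — previously decoded any N-bit value and relied on later checks. A plausible shape for the bounded decoder, modeled on the existing decodeUImmOperand helpers; the actual definition lives in RISCVDisassembler.cpp, outside this diff:

// Hedged sketch: decode an N-bit unsigned immediate, rejecting values below
// a lower bound (4 for uimm5gt3, 32 for uimm8ge32) so out-of-range encodings
// fail to decode instead of forming an invalid MCInst.
template <unsigned N, uint64_t Lower>
static DecodeStatus decodeUImmOperandGE(MCInst &Inst, uint32_t Imm,
                                        int64_t Address,
                                        const MCDisassembler *Decoder) {
  assert(isUInt<N>(Imm) && "Invalid immediate");
  if (Imm < Lower)
    return MCDisassembler::Fail;
  Inst.addOperand(MCOperand::createImm(Imm));
  return MCDisassembler::Success;
}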
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 3e9ce1c..4aeb503 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -445,12 +445,12 @@ static std::tuple<Register, SPIRVType *>
buildBoolRegister(MachineIRBuilder &MIRBuilder, const SPIRVType *ResultType,
SPIRVGlobalRegistry *GR) {
LLT Type;
- SPIRVType *BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder);
+ SPIRVType *BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder, true);
if (ResultType->getOpcode() == SPIRV::OpTypeVector) {
unsigned VectorElements = ResultType->getOperand(2).getImm();
- BoolType =
- GR->getOrCreateSPIRVVectorType(BoolType, VectorElements, MIRBuilder);
+ BoolType = GR->getOrCreateSPIRVVectorType(BoolType, VectorElements,
+ MIRBuilder, true);
const FixedVectorType *LLVMVectorType =
cast<FixedVectorType>(GR->getTypeForSPIRVType(BoolType));
Type = LLT::vector(LLVMVectorType->getElementCount(), 1);
@@ -476,11 +476,12 @@ static bool buildSelectInst(MachineIRBuilder &MIRBuilder,
if (ReturnType->getOpcode() == SPIRV::OpTypeVector) {
unsigned Bits = GR->getScalarOrVectorBitWidth(ReturnType);
uint64_t AllOnes = APInt::getAllOnes(Bits).getZExtValue();
- TrueConst = GR->getOrCreateConsIntVector(AllOnes, MIRBuilder, ReturnType);
- FalseConst = GR->getOrCreateConsIntVector(0, MIRBuilder, ReturnType);
+ TrueConst =
+ GR->getOrCreateConsIntVector(AllOnes, MIRBuilder, ReturnType, true);
+ FalseConst = GR->getOrCreateConsIntVector(0, MIRBuilder, ReturnType, true);
} else {
- TrueConst = GR->buildConstantInt(1, MIRBuilder, ReturnType);
- FalseConst = GR->buildConstantInt(0, MIRBuilder, ReturnType);
+ TrueConst = GR->buildConstantInt(1, MIRBuilder, ReturnType, true);
+ FalseConst = GR->buildConstantInt(0, MIRBuilder, ReturnType, true);
}
return MIRBuilder.buildSelect(ReturnRegister, SourceRegister, TrueConst,
@@ -580,8 +581,8 @@ static SPIRV::Scope::Scope getSPIRVScope(SPIRV::CLMemoryScope ClScope) {
static Register buildConstantIntReg32(uint64_t Val,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
- return GR->buildConstantInt(Val, MIRBuilder,
- GR->getOrCreateSPIRVIntegerType(32, MIRBuilder));
+ return GR->buildConstantInt(
+ Val, MIRBuilder, GR->getOrCreateSPIRVIntegerType(32, MIRBuilder), true);
}
static Register buildScopeReg(Register CLScopeRegister,
@@ -1170,7 +1171,7 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
Register Arg0;
if (GroupBuiltin->HasBoolArg) {
- SPIRVType *BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder);
+ SPIRVType *BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder, true);
Register BoolReg = Call->Arguments[0];
SPIRVType *BoolRegType = GR->getSPIRVTypeForVReg(BoolReg);
if (!BoolRegType)
@@ -1179,14 +1180,15 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
if (ArgInstruction->getOpcode() == TargetOpcode::G_CONSTANT) {
if (BoolRegType->getOpcode() != SPIRV::OpTypeBool)
Arg0 = GR->buildConstantInt(getIConstVal(BoolReg, MRI), MIRBuilder,
- BoolType);
+ BoolType, true);
} else {
if (BoolRegType->getOpcode() == SPIRV::OpTypeInt) {
Arg0 = MRI->createGenericVirtualRegister(LLT::scalar(1));
MRI->setRegClass(Arg0, &SPIRV::iIDRegClass);
GR->assignSPIRVTypeToVReg(BoolType, Arg0, MIRBuilder.getMF());
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, Arg0, BoolReg,
- GR->buildConstantInt(0, MIRBuilder, BoolRegType));
+ MIRBuilder.buildICmp(
+ CmpInst::ICMP_NE, Arg0, BoolReg,
+ GR->buildConstantInt(0, MIRBuilder, BoolRegType, true));
insertAssignInstr(Arg0, nullptr, BoolType, GR, MIRBuilder,
MIRBuilder.getMF().getRegInfo());
} else if (BoolRegType->getOpcode() != SPIRV::OpTypeBool) {
@@ -1231,7 +1233,7 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call,
LLT::fixed_vector(VecLen, MRI->getType(ElemReg)));
MRI->setRegClass(VecReg, &SPIRV::vIDRegClass);
SPIRVType *VecType =
- GR->getOrCreateSPIRVVectorType(ElemType, VecLen, MIRBuilder);
+ GR->getOrCreateSPIRVVectorType(ElemType, VecLen, MIRBuilder, true);
GR->assignSPIRVTypeToVReg(VecType, VecReg, MIRBuilder.getMF());
auto MIB =
MIRBuilder.buildInstr(TargetOpcode::G_BUILD_VECTOR).addDef(VecReg);
@@ -1475,11 +1477,11 @@ static bool genWorkgroupQuery(const SPIRV::IncomingCall *Call,
ToTruncate = DefaultReg;
}
auto NewRegister =
- GR->buildConstantInt(DefaultValue, MIRBuilder, PointerSizeType);
+ GR->buildConstantInt(DefaultValue, MIRBuilder, PointerSizeType, true);
MIRBuilder.buildCopy(DefaultReg, NewRegister);
} else { // If it could be in range, we need to load from the given builtin.
auto Vec3Ty =
- GR->getOrCreateSPIRVVectorType(PointerSizeType, 3, MIRBuilder);
+ GR->getOrCreateSPIRVVectorType(PointerSizeType, 3, MIRBuilder, true);
Register LoadedVector =
buildBuiltinVariableLoad(MIRBuilder, Vec3Ty, GR, BuiltinValue,
LLT::fixed_vector(3, PointerSize));
@@ -1502,7 +1504,7 @@ static bool genWorkgroupQuery(const SPIRV::IncomingCall *Call,
*MRI);
auto IndexType = GR->getSPIRVTypeForVReg(IndexRegister);
- auto BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder);
+ auto BoolType = GR->getOrCreateSPIRVBoolType(MIRBuilder, true);
Register CompareRegister =
MRI->createGenericVirtualRegister(LLT::scalar(1));
@@ -1510,13 +1512,14 @@ static bool genWorkgroupQuery(const SPIRV::IncomingCall *Call,
GR->assignSPIRVTypeToVReg(BoolType, CompareRegister, MIRBuilder.getMF());
// Use G_ICMP to check if idxVReg < 3.
- MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CompareRegister, IndexRegister,
- GR->buildConstantInt(3, MIRBuilder, IndexType));
+ MIRBuilder.buildICmp(
+ CmpInst::ICMP_ULT, CompareRegister, IndexRegister,
+ GR->buildConstantInt(3, MIRBuilder, IndexType, true));
// Get constant for the default value (0 or 1 depending on which
// function).
Register DefaultRegister =
- GR->buildConstantInt(DefaultValue, MIRBuilder, PointerSizeType);
+ GR->buildConstantInt(DefaultValue, MIRBuilder, PointerSizeType, true);
// Get a register for the selection result (possibly a new temporary one).
Register SelectionResult = Call->ReturnRegister;
@@ -1830,7 +1833,7 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call,
MIRBuilder.getMRI()->setRegClass(QueryResult, &SPIRV::vIDRegClass);
SPIRVType *IntTy = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder);
QueryResultType = GR->getOrCreateSPIRVVectorType(
- IntTy, NumActualRetComponents, MIRBuilder);
+ IntTy, NumActualRetComponents, MIRBuilder, true);
GR->assignSPIRVTypeToVReg(QueryResultType, QueryResult, MIRBuilder.getMF());
}
bool IsDimBuf = ImgType->getOperand(2).getImm() == SPIRV::Dim::DIM_Buffer;
@@ -1987,7 +1990,7 @@ static bool generateReadImageInst(const StringRef DemangledCall,
if (Call->ReturnType->getOpcode() != SPIRV::OpTypeVector) {
SPIRVType *TempType =
- GR->getOrCreateSPIRVVectorType(Call->ReturnType, 4, MIRBuilder);
+ GR->getOrCreateSPIRVVectorType(Call->ReturnType, 4, MIRBuilder, true);
Register TempRegister =
MRI->createGenericVirtualRegister(GR->getRegType(TempType));
MRI->setRegClass(TempRegister, GR->getRegClass(TempType));
@@ -2085,7 +2088,7 @@ static bool generateSampleImageInst(const StringRef DemangledCall,
SPIRVType *Type =
Call->ReturnType
? Call->ReturnType
- : GR->getOrCreateSPIRVTypeByName(ReturnType, MIRBuilder);
+ : GR->getOrCreateSPIRVTypeByName(ReturnType, MIRBuilder, true);
if (!Type) {
std::string DiagMsg =
"Unable to recognize SPIRV type name: " + ReturnType;
@@ -2294,7 +2297,8 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call,
unsigned BitWidth = GR->getPointerSize() == 64 ? 64 : 32;
Type *BaseTy = IntegerType::get(MF.getFunction().getContext(), BitWidth);
Type *FieldTy = ArrayType::get(BaseTy, Size);
- SPIRVType *SpvFieldTy = GR->getOrCreateSPIRVType(FieldTy, MIRBuilder);
+ SPIRVType *SpvFieldTy = GR->getOrCreateSPIRVType(
+ FieldTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
GlobalWorkSize = MRI->createVirtualRegister(&SPIRV::iIDRegClass);
GR->assignSPIRVTypeToVReg(SpvFieldTy, GlobalWorkSize, MF);
MIRBuilder.buildInstr(SPIRV::OpLoad)
@@ -2306,7 +2310,7 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call,
Const = GR->getOrCreateConstIntArray(0, Size, *MIRBuilder.getInsertPt(),
SpvFieldTy, *ST.getInstrInfo());
} else {
- Const = GR->buildConstantInt(0, MIRBuilder, SpvTy);
+ Const = GR->buildConstantInt(0, MIRBuilder, SpvTy, true);
}
if (!LocalWorkSize.isValid())
LocalWorkSize = Const;
@@ -2332,7 +2336,8 @@ getOrCreateSPIRVDeviceEventPointer(MachineIRBuilder &MIRBuilder,
LLVMContext &Context = MIRBuilder.getMF().getFunction().getContext();
unsigned SC1 = storageClassToAddressSpace(SPIRV::StorageClass::Generic);
Type *PtrType = PointerType::get(Context, SC1);
- return GR->getOrCreateSPIRVType(PtrType, MIRBuilder);
+ return GR->getOrCreateSPIRVType(PtrType, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
}
static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call,
@@ -2481,7 +2486,7 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call,
SPIRVType *NewType =
Call->ReturnType->getOpcode() == SPIRV::OpTypeEvent
? nullptr
- : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder);
+ : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder, true);
Register TypeReg = GR->getSPIRVTypeID(NewType ? NewType : Call->ReturnType);
unsigned NumArgs = Call->Arguments.size();
Register EventReg = Call->Arguments[NumArgs - 1];
@@ -2984,12 +2989,13 @@ static SPIRVType *getCoopMatrType(const TargetExtType *ExtensionType,
assert(ExtensionType->getNumTypeParameters() == 1 &&
"SPIR-V coop matrices builtin type must have a type parameter!");
const SPIRVType *ElemType =
- GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder);
+ GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
// Create or get an existing type from GlobalRegistry.
return GR->getOrCreateOpTypeCoopMatr(
MIRBuilder, ExtensionType, ElemType, ExtensionType->getIntParameter(0),
ExtensionType->getIntParameter(1), ExtensionType->getIntParameter(2),
- ExtensionType->getIntParameter(3));
+ ExtensionType->getIntParameter(3), true);
}
static SPIRVType *
@@ -2999,7 +3005,8 @@ getImageType(const TargetExtType *ExtensionType,
assert(ExtensionType->getNumTypeParameters() == 1 &&
"SPIR-V image builtin type must have sampled type parameter!");
const SPIRVType *SampledType =
- GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder);
+ GR->getOrCreateSPIRVType(ExtensionType->getTypeParameter(0), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
assert((ExtensionType->getNumIntParameters() == 7 ||
ExtensionType->getNumIntParameters() == 6) &&
"Invalid number of parameters for SPIR-V image builtin!");
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 78f6b18..b5a3b19 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -209,13 +209,15 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
// If OriginalArgType is non-pointer, use the OriginalArgType (the type cannot
// be legally reassigned later).
if (!isPointerTy(OriginalArgType))
- return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual);
+ return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual,
+ true);
Argument *Arg = F.getArg(ArgIdx);
Type *ArgType = Arg->getType();
if (isTypedPointerTy(ArgType)) {
SPIRVType *ElementType = GR->getOrCreateSPIRVType(
- cast<TypedPointerType>(ArgType)->getElementType(), MIRBuilder);
+ cast<TypedPointerType>(ArgType)->getElementType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(getPointerAddressSpace(ArgType), ST));
@@ -231,7 +233,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
// type.
if (hasPointeeTypeAttr(Arg)) {
SPIRVType *ElementType =
- GR->getOrCreateSPIRVType(getPointeeTypeByAttr(Arg), MIRBuilder);
+ GR->getOrCreateSPIRVType(getPointeeTypeByAttr(Arg), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(getPointerAddressSpace(ArgType), ST));
@@ -245,7 +248,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
Type *BuiltinType =
cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
assert(BuiltinType->isTargetExtTy() && "Expected TargetExtType");
- return GR->getOrCreateSPIRVType(BuiltinType, MIRBuilder, ArgAccessQual);
+ return GR->getOrCreateSPIRVType(BuiltinType, MIRBuilder, ArgAccessQual,
+ true);
}
// Check if this is spv_assign_ptr_type assigning pointer element type.
@@ -255,7 +259,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
Type *ElementTy =
toTypedPointer(cast<ConstantAsMetadata>(VMD->getMetadata())->getType());
- SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder);
+ SPIRVType *ElementType = GR->getOrCreateSPIRVType(
+ ElementTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(
@@ -265,7 +270,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
// Replace PointerType with TypedPointerType to be able to map SPIR-V types to
// LLVM types in a consistent manner
return GR->getOrCreateSPIRVType(toTypedPointer(OriginalArgType), MIRBuilder,
- ArgAccessQual);
+ ArgAccessQual, true);
}
static SPIRV::ExecutionModel::ExecutionModel
@@ -405,7 +410,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
FRetTy = DerivedTy;
}
}
- SPIRVType *RetTy = GR->getOrCreateSPIRVType(FRetTy, MIRBuilder);
+ SPIRVType *RetTy = GR->getOrCreateSPIRVType(
+ FRetTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
FTy = fixFunctionTypeIfPtrArgs(GR, F, FTy, RetTy, ArgTypeVRegs);
SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
FTy, RetTy, ArgTypeVRegs, MIRBuilder);
@@ -486,10 +492,12 @@ void SPIRVCallLowering::produceIndirectPtrTypes(
// Create indirect call data types if any
MachineFunction &MF = MIRBuilder.getMF();
for (auto const &IC : IndirectCalls) {
- SPIRVType *SpirvRetTy = GR->getOrCreateSPIRVType(IC.RetTy, MIRBuilder);
+ SPIRVType *SpirvRetTy = GR->getOrCreateSPIRVType(
+ IC.RetTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
SmallVector<SPIRVType *, 4> SpirvArgTypes;
for (size_t i = 0; i < IC.ArgTys.size(); ++i) {
- SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(IC.ArgTys[i], MIRBuilder);
+ SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(
+ IC.ArgTys[i], MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
SpirvArgTypes.push_back(SPIRVTy);
if (!GR->getSPIRVTypeForVReg(IC.ArgRegs[i]))
GR->assignSPIRVTypeToVReg(SPIRVTy, IC.ArgRegs[i], MF);
@@ -557,10 +565,12 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
RetTy =
TypedPointerType::get(ElemTy, PtrRetTy->getAddressSpace());
}
- setRegClassType(ResVReg, RetTy, GR, MIRBuilder);
+ setRegClassType(ResVReg, RetTy, GR, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
}
} else {
- ResVReg = createVirtualRegister(OrigRetTy, GR, MIRBuilder);
+ ResVReg = createVirtualRegister(OrigRetTy, GR, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
}
SmallVector<Register, 8> ArgVRegs;
for (auto Arg : Info.OrigArgs) {
@@ -584,7 +594,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
ArgTy = Arg.Ty;
}
if (ArgTy) {
- SpvType = GR->getOrCreateSPIRVType(ArgTy, MIRBuilder);
+ SpvType = GR->getOrCreateSPIRVType(
+ ArgTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
GR->assignSPIRVTypeToVReg(SpvType, ArgReg, MF);
}
}
@@ -669,7 +680,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Make sure there's a valid return reg, even for functions returning void.
if (!ResVReg.isValid())
ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
- SPIRVType *RetType = GR->assignTypeToVReg(OrigRetTy, ResVReg, MIRBuilder);
+ SPIRVType *RetType = GR->assignTypeToVReg(
+ OrigRetTy, ResVReg, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
// Emit the call instruction and its args.
auto MIB = MIRBuilder.buildInstr(CallOp)
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
index b98cef0..ee98af5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
@@ -193,7 +193,8 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) {
};
const SPIRVType *VoidTy =
- GR->getOrCreateSPIRVType(Type::getVoidTy(*Context), MIRBuilder);
+ GR->getOrCreateSPIRVType(Type::getVoidTy(*Context), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, false);
const auto EmitDIInstruction =
[&](SPIRV::NonSemanticExtInst::NonSemanticExtInst Inst,
@@ -217,7 +218,8 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) {
};
const SPIRVType *I32Ty =
- GR->getOrCreateSPIRVType(Type::getInt32Ty(*Context), MIRBuilder);
+ GR->getOrCreateSPIRVType(Type::getInt32Ty(*Context), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, false);
const Register DwarfVersionReg =
GR->buildConstantInt(DwarfVersion, MIRBuilder, I32Ty, false);
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index abc49b0..0ed414e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -244,7 +244,8 @@ SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType,
CurMF->getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
CurMF->getRegInfo().setRegClass(Res, &SPIRV::iIDRegClass);
if (MIRBuilder)
- assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder);
+ assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
else
assignIntTypeToVReg(BitWidth, Res, *I, *TII);
DT.add(CI, CurMF, Res);
@@ -271,7 +272,8 @@ SPIRVGlobalRegistry::getOrCreateConstFloatReg(APFloat Val, SPIRVType *SpvType,
CurMF->getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
CurMF->getRegInfo().setRegClass(Res, &SPIRV::fIDRegClass);
if (MIRBuilder)
- assignTypeToVReg(LLVMFloatTy, Res, *MIRBuilder);
+ assignTypeToVReg(LLVMFloatTy, Res, *MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
else
assignFloatTypeToVReg(BitWidth, Res, *I, *TII);
DT.add(CI, CurMF, Res);
@@ -410,7 +412,8 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
auto &Ctx = MF.getFunction().getContext();
if (!SpvType) {
const Type *LLVMFPTy = Type::getFloatTy(Ctx);
- SpvType = getOrCreateSPIRVType(LLVMFPTy, MIRBuilder);
+ SpvType = getOrCreateSPIRVType(LLVMFPTy, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
}
// Find a constant in DT or build a new one.
const auto ConstFP = ConstantFP::get(Ctx, Val);
@@ -651,9 +654,10 @@ Register SPIRVGlobalRegistry::buildConstantSampler(
MachineIRBuilder &MIRBuilder, SPIRVType *SpvType) {
SPIRVType *SampTy;
if (SpvType)
- SampTy = getOrCreateSPIRVType(getTypeForSPIRVType(SpvType), MIRBuilder);
- else if ((SampTy = getOrCreateSPIRVTypeByName("opencl.sampler_t",
- MIRBuilder)) == nullptr)
+ SampTy = getOrCreateSPIRVType(getTypeForSPIRVType(SpvType), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
+ else if ((SampTy = getOrCreateSPIRVTypeByName("opencl.sampler_t", MIRBuilder,
+ false)) == nullptr)
report_fatal_error("Unable to recognize SPIRV type name: opencl.sampler_t");
auto Sampler =
@@ -878,9 +882,9 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty,
});
}
-SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty,
- MachineIRBuilder &MIRBuilder,
- bool EmitIR) {
+SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(
+ const StructType *Ty, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier::AccessQualifier AccQual, bool EmitIR) {
SmallVector<Register, 4> FieldTypes;
constexpr unsigned MaxWordCount = UINT16_MAX;
const size_t NumElements = Ty->getNumElements();
@@ -894,7 +898,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty,
}
for (const auto &Elem : Ty->elements()) {
- SPIRVType *ElemTy = findSPIRVType(toTypedPointer(Elem), MIRBuilder);
+ SPIRVType *ElemTy =
+ findSPIRVType(toTypedPointer(Elem), MIRBuilder, AccQual, EmitIR);
assert(ElemTy && ElemTy->getOpcode() != SPIRV::OpTypeVoid &&
"Invalid struct element type");
FieldTypes.push_back(getSPIRVTypeID(ElemTy));
@@ -1041,26 +1046,27 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
if (Ty->isVoidTy())
return getOpTypeVoid(MIRBuilder);
if (Ty->isVectorTy()) {
- SPIRVType *El =
- findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), MIRBuilder);
+ SPIRVType *El = findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(),
+ MIRBuilder, AccQual, EmitIR);
return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El,
MIRBuilder);
}
if (Ty->isArrayTy()) {
- SPIRVType *El = findSPIRVType(Ty->getArrayElementType(), MIRBuilder);
+ SPIRVType *El =
+ findSPIRVType(Ty->getArrayElementType(), MIRBuilder, AccQual, EmitIR);
return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR);
}
if (auto SType = dyn_cast<StructType>(Ty)) {
if (SType->isOpaque())
return getOpTypeOpaque(SType, MIRBuilder);
- return getOpTypeStruct(SType, MIRBuilder, EmitIR);
+ return getOpTypeStruct(SType, MIRBuilder, AccQual, EmitIR);
}
if (auto FType = dyn_cast<FunctionType>(Ty)) {
- SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder);
+ SPIRVType *RetTy =
+ findSPIRVType(FType->getReturnType(), MIRBuilder, AccQual, EmitIR);
SmallVector<SPIRVType *, 4> ParamTypes;
- for (const auto &t : FType->params()) {
- ParamTypes.push_back(findSPIRVType(t, MIRBuilder));
- }
+ for (const auto &ParamTy : FType->params())
+ ParamTypes.push_back(findSPIRVType(ParamTy, MIRBuilder, AccQual, EmitIR));
return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder);
}
@@ -1401,7 +1407,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeSampledImage(
SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr(
MachineIRBuilder &MIRBuilder, const TargetExtType *ExtensionType,
const SPIRVType *ElemType, uint32_t Scope, uint32_t Rows, uint32_t Columns,
- uint32_t Use) {
+ uint32_t Use, bool EmitIR) {
Register ResVReg = DT.find(ExtensionType, &MIRBuilder.getMF());
if (ResVReg.isValid())
return MIRBuilder.getMF().getRegInfo().getUniqueVRegDef(ResVReg);
@@ -1411,10 +1417,10 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr(
MIRBuilder.buildInstr(SPIRV::OpTypeCooperativeMatrixKHR)
.addDef(ResVReg)
.addUse(getSPIRVTypeID(ElemType))
- .addUse(buildConstantInt(Scope, MIRBuilder, SpvTypeInt32, true))
- .addUse(buildConstantInt(Rows, MIRBuilder, SpvTypeInt32, true))
- .addUse(buildConstantInt(Columns, MIRBuilder, SpvTypeInt32, true))
- .addUse(buildConstantInt(Use, MIRBuilder, SpvTypeInt32, true));
+ .addUse(buildConstantInt(Scope, MIRBuilder, SpvTypeInt32, EmitIR))
+ .addUse(buildConstantInt(Rows, MIRBuilder, SpvTypeInt32, EmitIR))
+ .addUse(buildConstantInt(Columns, MIRBuilder, SpvTypeInt32, EmitIR))
+ .addUse(buildConstantInt(Use, MIRBuilder, SpvTypeInt32, EmitIR));
DT.add(ExtensionType, &MIRBuilder.getMF(), ResVReg);
return SpirvTy;
}
@@ -1441,7 +1447,7 @@ SPIRVGlobalRegistry::checkSpecialInstr(const SPIRV::SpecialTypeDescriptor &TD,
// Returns nullptr if unable to recognize SPIRV type name
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
- StringRef TypeStr, MachineIRBuilder &MIRBuilder,
+ StringRef TypeStr, MachineIRBuilder &MIRBuilder, bool EmitIR,
SPIRV::StorageClass::StorageClass SC,
SPIRV::AccessQualifier::AccessQualifier AQ) {
unsigned VecElts = 0;
@@ -1451,7 +1457,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
if (hasBuiltinTypePrefix(TypeStr))
return getOrCreateSPIRVType(SPIRV::parseBuiltinTypeNameToTargetExtType(
TypeStr.str(), MIRBuilder.getContext()),
- MIRBuilder, AQ);
+ MIRBuilder, AQ, true);
// Parse type name in either "typeN" or "type vector[N]" format, where
// N is the number of elements of the vector.
@@ -1462,7 +1468,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
// Unable to recognize SPIRV type name
return nullptr;
- auto SpirvTy = getOrCreateSPIRVType(Ty, MIRBuilder, AQ);
+ auto SpirvTy = getOrCreateSPIRVType(Ty, MIRBuilder, AQ, true);
// Handle "type*" or "type* vector[N]".
if (TypeStr.starts_with("*")) {
@@ -1478,7 +1484,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
}
TypeStr.getAsInteger(10, VecElts);
if (VecElts > 0)
- SpirvTy = getOrCreateSPIRVVectorType(SpirvTy, VecElts, MIRBuilder);
+ SpirvTy = getOrCreateSPIRVVectorType(SpirvTy, VecElts, MIRBuilder, EmitIR);
if (IsPtrToVec)
SpirvTy = getOrCreateSPIRVPointerType(SpirvTy, MIRBuilder, SC);
@@ -1491,7 +1497,7 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
MachineIRBuilder &MIRBuilder) {
return getOrCreateSPIRVType(
IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), BitWidth),
- MIRBuilder);
+ MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
}
SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy,
@@ -1551,10 +1557,11 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVFloatType(
}
SPIRVType *
-SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder) {
+SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder,
+ bool EmitIR) {
return getOrCreateSPIRVType(
IntegerType::get(MIRBuilder.getMF().getFunction().getContext(), 1),
- MIRBuilder);
+ MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
}
SPIRVType *
@@ -1572,11 +1579,12 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I,
}
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
- SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder) {
+ SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder,
+ bool EmitIR) {
return getOrCreateSPIRVType(
FixedVectorType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
NumElements),
- MIRBuilder);
+ MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
}
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 0c94ec4..2c24ba7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -94,13 +94,11 @@ class SPIRVGlobalRegistry {
// Add a new OpTypeXXX instruction without checking for duplicates.
SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
- SPIRV::AccessQualifier::AccessQualifier AQ =
- SPIRV::AccessQualifier::ReadWrite,
- bool EmitIR = true);
+ SPIRV::AccessQualifier::AccessQualifier AQ,
+ bool EmitIR);
SPIRVType *findSPIRVType(const Type *Ty, MachineIRBuilder &MIRBuilder,
- SPIRV::AccessQualifier::AccessQualifier accessQual =
- SPIRV::AccessQualifier::ReadWrite,
- bool EmitIR = true);
+ SPIRV::AccessQualifier::AccessQualifier accessQual,
+ bool EmitIR);
SPIRVType *
restOfCreateSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccessQual,
@@ -321,9 +319,8 @@ public:
// and map it to the given VReg by creating an ASSIGN_TYPE instruction.
SPIRVType *assignTypeToVReg(const Type *Type, Register VReg,
MachineIRBuilder &MIRBuilder,
- SPIRV::AccessQualifier::AccessQualifier AQ =
- SPIRV::AccessQualifier::ReadWrite,
- bool EmitIR = true);
+ SPIRV::AccessQualifier::AccessQualifier AQ,
+ bool EmitIR);
SPIRVType *assignIntTypeToVReg(unsigned BitWidth, Register VReg,
MachineInstr &I, const SPIRVInstrInfo &TII);
SPIRVType *assignFloatTypeToVReg(unsigned BitWidth, Register VReg,
@@ -344,9 +341,8 @@ public:
// want to emit extra IR instructions there.
SPIRVType *getOrCreateSPIRVType(const Type *Type,
MachineIRBuilder &MIRBuilder,
- SPIRV::AccessQualifier::AccessQualifier AQ =
- SPIRV::AccessQualifier::ReadWrite,
- bool EmitIR = true);
+ SPIRV::AccessQualifier::AccessQualifier AQ,
+ bool EmitIR);
const Type *getTypeForSPIRVType(const SPIRVType *Ty) const {
auto Res = SPIRVToLLVMType.find(Ty);
@@ -363,7 +359,7 @@ public:
// corresponding to the given string containing the name of the builtin type.
// Return nullptr if unable to recognize SPIRV type name from `TypeStr`.
SPIRVType *getOrCreateSPIRVTypeByName(
- StringRef TypeStr, MachineIRBuilder &MIRBuilder,
+ StringRef TypeStr, MachineIRBuilder &MIRBuilder, bool EmitIR,
SPIRV::StorageClass::StorageClass SC = SPIRV::StorageClass::Function,
SPIRV::AccessQualifier::AccessQualifier AQ =
SPIRV::AccessQualifier::ReadWrite);
@@ -470,13 +466,14 @@ private:
MachineIRBuilder &MIRBuilder);
SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType,
- MachineIRBuilder &MIRBuilder, bool EmitIR = true);
+ MachineIRBuilder &MIRBuilder, bool EmitIR);
SPIRVType *getOpTypeOpaque(const StructType *Ty,
MachineIRBuilder &MIRBuilder);
SPIRVType *getOpTypeStruct(const StructType *Ty, MachineIRBuilder &MIRBuilder,
- bool EmitIR = true);
+ SPIRV::AccessQualifier::AccessQualifier AccQual,
+ bool EmitIR);
SPIRVType *getOpTypePointer(SPIRV::StorageClass::StorageClass SC,
SPIRVType *ElemType, MachineIRBuilder &MIRBuilder,
@@ -518,7 +515,7 @@ private:
public:
Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
- SPIRVType *SpvType, bool EmitIR = true,
+ SPIRVType *SpvType, bool EmitIR,
bool ZeroAsNull = true);
Register getOrCreateConstInt(uint64_t Val, MachineInstr &I,
SPIRVType *SpvType, const SPIRVInstrInfo &TII,
@@ -539,7 +536,7 @@ public:
SPIRVType *SpvType,
const SPIRVInstrInfo &TII);
Register getOrCreateConsIntVector(uint64_t Val, MachineIRBuilder &MIRBuilder,
- SPIRVType *SpvType, bool EmitIR = true);
+ SPIRVType *SpvType, bool EmitIR);
Register getOrCreateConstNullPtr(MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType);
Register buildConstantSampler(Register Res, unsigned AddrMode, unsigned Param,
@@ -570,12 +567,14 @@ public:
unsigned SPIRVOPcode, Type *LLVMTy);
SPIRVType *getOrCreateSPIRVFloatType(unsigned BitWidth, MachineInstr &I,
const SPIRVInstrInfo &TII);
- SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder);
+ SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder,
+ bool EmitIR);
SPIRVType *getOrCreateSPIRVBoolType(MachineInstr &I,
const SPIRVInstrInfo &TII);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements,
- MachineIRBuilder &MIRBuilder);
+ MachineIRBuilder &MIRBuilder,
+ bool EmitIR);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements, MachineInstr &I,
const SPIRVInstrInfo &TII);
@@ -605,7 +604,8 @@ public:
const TargetExtType *ExtensionType,
const SPIRVType *ElemType,
uint32_t Scope, uint32_t Rows,
- uint32_t Columns, uint32_t Use);
+ uint32_t Columns, uint32_t Use,
+ bool EmitIR);
SPIRVType *
getOrCreateOpTypePipe(MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccQual);
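
Note: the common thread across the SPIR-V files is the removal of the defaulted AccessQualifier and EmitIR arguments from the type-creation APIs, so every call site must now state whether creating a type may emit extra IR. A hedged before/after illustration using the identifiers from this diff (a fragment, not compilable on its own):

// Before (no longer compiles): the defaults silently chose
// ReadWrite access and EmitIR=true.
//   SPIRVType *Ty = GR->getOrCreateSPIRVType(LLVMTy, MIRBuilder);

// After: the choice is explicit at every call site. Callers that must not
// grow the IR at that point (e.g. SPIRVEmitNonSemanticDI above, or
// post-legalization selection code) now pass /*EmitIR=*/false.
SPIRVType *Ty = GR->getOrCreateSPIRVType(
    LLVMTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, /*EmitIR=*/true);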
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index d5b81bf..c347dde 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -126,8 +126,7 @@ static void doInsertBitcast(const SPIRVSubtarget &STI, MachineRegisterInfo *MRI,
static SPIRVType *createNewPtrType(SPIRVGlobalRegistry &GR, MachineInstr &I,
SPIRVType *OpType, bool ReuseType,
- bool EmitIR, SPIRVType *ResType,
- const Type *ResTy) {
+ SPIRVType *ResType, const Type *ResTy) {
SPIRV::StorageClass::StorageClass SC =
static_cast<SPIRV::StorageClass::StorageClass>(
OpType->getOperand(1).getImm());
@@ -135,7 +134,7 @@ static SPIRVType *createNewPtrType(SPIRVGlobalRegistry &GR, MachineInstr &I,
SPIRVType *NewBaseType =
ReuseType ? ResType
: GR.getOrCreateSPIRVType(
- ResTy, MIB, SPIRV::AccessQualifier::ReadWrite, EmitIR);
+ ResTy, MIB, SPIRV::AccessQualifier::ReadWrite, false);
return GR.getOrCreateSPIRVPointerType(NewBaseType, MIB, SC);
}
@@ -166,7 +165,7 @@ static void validatePtrTypes(const SPIRVSubtarget &STI,
// There is a type mismatch between results and operand types
// and we insert a bitcast before the instruction to keep SPIR-V code valid
SPIRVType *NewPtrType =
- createNewPtrType(GR, I, OpType, IsSameMF, false, ResType, ResTy);
+ createNewPtrType(GR, I, OpType, IsSameMF, ResType, ResTy);
if (!GR.isBitcastCompatible(NewPtrType, OpType))
report_fatal_error(
"insert validation bitcast: incompatible result and operand types");
@@ -192,7 +191,7 @@ static void validateGroupWaitEventsPtr(const SPIRVSubtarget &STI,
// Insert a bitcast before the instruction to keep SPIR-V code valid.
LLVMContext &Context = MF->getFunction().getContext();
SPIRVType *NewPtrType =
- createNewPtrType(GR, I, OpType, false, true, nullptr,
+ createNewPtrType(GR, I, OpType, false, nullptr,
TargetExtType::get(Context, "spirv.Event"));
doInsertBitcast(STI, MRI, GR, I, OpReg, OpIdx, NewPtrType);
}
@@ -216,7 +215,8 @@ static void validateLifetimeStart(const SPIRVSubtarget &STI,
MachineIRBuilder MIB(I);
LLVMContext &Context = MF->getFunction().getContext();
SPIRVType *ElemType =
- GR.getOrCreateSPIRVType(IntegerType::getInt8Ty(Context), MIB);
+ GR.getOrCreateSPIRVType(IntegerType::getInt8Ty(Context), MIB,
+ SPIRV::AccessQualifier::ReadWrite, false);
SPIRVType *NewPtrType = GR.getOrCreateSPIRVPointerType(ElemType, MIB, SC);
doInsertBitcast(STI, MRI, GR, I, PtrReg, 0, NewPtrType);
}
@@ -492,12 +492,12 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
SPIRVType *Int32Type = GR.getOrCreateSPIRVIntegerType(32, MIB);
SPIRVType *RetType = MRI->getVRegDef(MI.getOperand(1).getReg());
assert(RetType && "Expected return type");
- validatePtrTypes(
- STI, MRI, GR, MI, MI.getNumOperands() - 1,
- RetType->getOpcode() != SPIRV::OpTypeVector
- ? Int32Type
- : GR.getOrCreateSPIRVVectorType(
- Int32Type, RetType->getOperand(2).getImm(), MIB));
+ validatePtrTypes(STI, MRI, GR, MI, MI.getNumOperands() - 1,
+ RetType->getOpcode() != SPIRV::OpTypeVector
+ ? Int32Type
+ : GR.getOrCreateSPIRVVectorType(
+ Int32Type, RetType->getOperand(2).getImm(),
+ MIB, false));
} break;
case SPIRV::OpenCLExtInst::fract:
case SPIRV::OpenCLExtInst::modf:
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 3566bc5..c52b67e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3399,9 +3399,10 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
MachineIRBuilder MIRBuilder(I);
SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
SPIRVType *I64Type = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
- SPIRVType *I64x2Type = GR.getOrCreateSPIRVVectorType(I64Type, 2, MIRBuilder);
+ SPIRVType *I64x2Type =
+ GR.getOrCreateSPIRVVectorType(I64Type, 2, MIRBuilder, false);
SPIRVType *Vec2ResType =
- GR.getOrCreateSPIRVVectorType(BaseType, 2, MIRBuilder);
+ GR.getOrCreateSPIRVVectorType(BaseType, 2, MIRBuilder, false);
std::vector<Register> PartialRegs;
@@ -3484,8 +3485,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
// 1. Split int64 into 2 pieces using a bitcast
MachineIRBuilder MIRBuilder(I);
- SPIRVType *PostCastType =
- GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
+ SPIRVType *PostCastType = GR.getOrCreateSPIRVVectorType(
+ BaseType, 2 * ComponentCount, MIRBuilder, false);
Register BitcastReg =
MRI->createVirtualRegister(GR.getRegClass(PostCastType));
@@ -3562,8 +3563,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
SelectOp = SPIRV::OpSelectSISCond;
AddOp = SPIRV::OpIAddS;
} else {
- BoolType =
- GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount, MIRBuilder);
+ BoolType = GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount,
+ MIRBuilder, false);
NegOneReg =
GR.getOrCreateConstVector((unsigned)-1, I, ResType, TII, ZeroAsNull);
Reg0 = GR.getOrCreateConstVector(0, I, ResType, TII, ZeroAsNull);
@@ -3930,7 +3931,7 @@ bool SPIRVInstructionSelector::loadVec3BuiltinInputID(
MachineIRBuilder MIRBuilder(I);
const SPIRVType *U32Type = GR.getOrCreateSPIRVIntegerType(32, MIRBuilder);
const SPIRVType *Vec3Ty =
- GR.getOrCreateSPIRVVectorType(U32Type, 3, MIRBuilder);
+ GR.getOrCreateSPIRVVectorType(U32Type, 3, MIRBuilder, false);
const SPIRVType *PtrType = GR.getOrCreateSPIRVPointerType(
Vec3Ty, MIRBuilder, SPIRV::StorageClass::Input);
@@ -3979,7 +3980,7 @@ SPIRVType *SPIRVInstructionSelector::widenTypeToVec4(const SPIRVType *Type,
MachineInstr &I) const {
MachineIRBuilder MIRBuilder(I);
if (Type->getOpcode() != SPIRV::OpTypeVector)
- return GR.getOrCreateSPIRVVectorType(Type, 4, MIRBuilder);
+ return GR.getOrCreateSPIRVVectorType(Type, 4, MIRBuilder, false);
uint64_t VectorSize = Type->getOperand(2).getImm();
if (VectorSize == 4)
@@ -3987,7 +3988,7 @@ SPIRVType *SPIRVInstructionSelector::widenTypeToVec4(const SPIRVType *Type,
Register ScalarTypeReg = Type->getOperand(1).getReg();
const SPIRVType *ScalarType = GR.getSPIRVTypeForVReg(ScalarTypeReg);
- return GR.getOrCreateSPIRVVectorType(ScalarType, 4, MIRBuilder);
+ return GR.getOrCreateSPIRVVectorType(ScalarType, 4, MIRBuilder, false);
}
bool SPIRVInstructionSelector::loadHandleBeforePosition(
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index fa5e0a8..daa8ea5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -405,7 +405,8 @@ bool SPIRVLegalizerInfo::legalizeCustom(
LLT ConvT = LLT::scalar(ST->getPointerSize());
Type *LLVMTy = IntegerType::get(MI.getMF()->getFunction().getContext(),
ST->getPointerSize());
- SPIRVType *SpirvTy = GR->getOrCreateSPIRVType(LLVMTy, Helper.MIRBuilder);
+ SPIRVType *SpirvTy = GR->getOrCreateSPIRVType(
+ LLVMTy, Helper.MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true);
Op0.setReg(convertPtrToInt(Reg0, ConvT, SpirvTy, Helper, MRI, GR));
Op1.setReg(convertPtrToInt(Reg1, ConvT, SpirvTy, Helper, MRI, GR));
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 32f6af3..5d70b9c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -98,8 +98,9 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
TargetExtConstTypes[SrcMI] = Const->getType();
if (Const->isNullValue()) {
MachineIRBuilder MIB(MF);
- SPIRVType *ExtType =
- GR->getOrCreateSPIRVType(Const->getType(), MIB);
+ SPIRVType *ExtType = GR->getOrCreateSPIRVType(
+ Const->getType(), MIB, SPIRV::AccessQualifier::ReadWrite,
+ true);
SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
SrcMI->addOperand(MachineOperand::CreateReg(
GR->getSPIRVTypeID(ExtType), false));
@@ -248,7 +249,8 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
Register Def = MI.getOperand(0).getReg();
Register Source = MI.getOperand(2).getReg();
Type *ElemTy = getMDOperandAsType(MI.getOperand(3).getMetadata(), 0);
- SPIRVType *BaseTy = GR->getOrCreateSPIRVType(ElemTy, MIB);
+ SPIRVType *BaseTy = GR->getOrCreateSPIRVType(
+ ElemTy, MIB, SPIRV::AccessQualifier::ReadWrite, true);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST));
@@ -299,7 +301,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
case TargetOpcode::G_CONSTANT: {
MIB.setInsertPt(*MI->getParent(), MI);
Type *Ty = MI->getOperand(1).getCImm()->getType();
- SpvType = GR->getOrCreateSPIRVType(Ty, MIB);
+ SpvType = GR->getOrCreateSPIRVType(
+ Ty, MIB, SPIRV::AccessQualifier::ReadWrite, true);
break;
}
case TargetOpcode::G_GLOBAL_VALUE: {
@@ -308,7 +311,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
Type *ElementTy = toTypedPointer(GR->getDeducedGlobalValueType(Global));
auto *Ty = TypedPointerType::get(ElementTy,
Global->getType()->getAddressSpace());
- SpvType = GR->getOrCreateSPIRVType(Ty, MIB);
+ SpvType = GR->getOrCreateSPIRVType(
+ Ty, MIB, SPIRV::AccessQualifier::ReadWrite, true);
break;
}
case TargetOpcode::G_ANYEXT:
@@ -324,8 +328,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
unsigned NumElements = GR->getScalarOrVectorComponentCount(Def);
SpvType = GR->getOrCreateSPIRVIntegerType(ExpectedBW, MIB);
if (NumElements > 1)
- SpvType =
- GR->getOrCreateSPIRVVectorType(SpvType, NumElements, MIB);
+ SpvType = GR->getOrCreateSPIRVVectorType(SpvType, NumElements,
+ MIB, true);
}
}
}
@@ -431,7 +435,9 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType,
assert((Ty || SpvType) && "Either LLVM or SPIRV type is expected.");
MachineInstr *Def = MRI.getVRegDef(Reg);
setInsertPtAfterDef(MIB, Def);
- SpvType = SpvType ? SpvType : GR->getOrCreateSPIRVType(Ty, MIB);
+ SpvType = SpvType ? SpvType
+ : GR->getOrCreateSPIRVType(
+ Ty, MIB, SPIRV::AccessQualifier::ReadWrite, true);
Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg));
if (auto *RC = MRI.getRegClassOrNull(Reg)) {
MRI.setRegClass(NewReg, RC);
@@ -518,7 +524,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
Register Reg = MI.getOperand(1).getReg();
MIB.setInsertPt(*MI.getParent(), MI.getIterator());
Type *ElementTy = getMDOperandAsType(MI.getOperand(2).getMetadata(), 0);
- SPIRVType *BaseTy = GR->getOrCreateSPIRVType(ElementTy, MIB);
+ SPIRVType *BaseTy = GR->getOrCreateSPIRVType(
+ ElementTy, MIB, SPIRV::AccessQualifier::ReadWrite, true);
SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType(
BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(),
addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST));
@@ -737,9 +744,11 @@ insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR,
FunctionType *FTy = cast<FunctionType>(getMDOperandAsType(IAMD, 0));
SmallVector<SPIRVType *, 4> ArgTypes;
for (const auto &ArgTy : FTy->params())
- ArgTypes.push_back(GR->getOrCreateSPIRVType(ArgTy, MIRBuilder));
+ ArgTypes.push_back(GR->getOrCreateSPIRVType(
+ ArgTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, true));
SPIRVType *RetType =
- GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
SPIRVType *FuncType = GR->getOrCreateOpTypeFunctionWithArgs(
FTy, RetType, ArgTypes, MIRBuilder);
@@ -772,7 +781,8 @@ insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR,
DefReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
MRI.setRegClass(DefReg, &SPIRV::iIDRegClass);
SPIRVType *VoidType = GR->getOrCreateSPIRVType(
- Type::getVoidTy(MF.getFunction().getContext()), MIRBuilder);
+ Type::getVoidTy(MF.getFunction().getContext()), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, true);
GR->assignSPIRVTypeToVReg(VoidType, DefReg, MF);
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index ddc66f9..c55b735 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -738,9 +738,12 @@ void setRegClassType(Register Reg, SPIRVType *SpvType, SPIRVGlobalRegistry *GR,
// no valid assigned class, set register LLT type and class according to the
// SPIR-V type.
void setRegClassType(Register Reg, const Type *Ty, SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder, bool Force) {
- setRegClassType(Reg, GR->getOrCreateSPIRVType(Ty, MIRBuilder), GR,
- MIRBuilder.getMRI(), MIRBuilder.getMF(), Force);
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier::AccessQualifier AccessQual,
+ bool EmitIR, bool Force) {
+ setRegClassType(Reg,
+ GR->getOrCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR),
+ GR, MIRBuilder.getMRI(), MIRBuilder.getMF(), Force);
}
// Create a virtual register and assign SPIR-V type to the register. Set
@@ -764,10 +767,12 @@ Register createVirtualRegister(SPIRVType *SpvType, SPIRVGlobalRegistry *GR,
// Create a SPIR-V type, virtual register and assign SPIR-V type to the
// register. Set register LLT type and class according to the SPIR-V type.
-Register createVirtualRegister(const Type *Ty, SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder) {
- return createVirtualRegister(GR->getOrCreateSPIRVType(Ty, MIRBuilder), GR,
- MIRBuilder);
+Register createVirtualRegister(
+ const Type *Ty, SPIRVGlobalRegistry *GR, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
+ return createVirtualRegister(
+ GR->getOrCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR), GR,
+ MIRBuilder);
}
// Return true if there is an opaque pointer type nested in the argument.
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index 552adf2..8706498 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -411,7 +411,9 @@ MachineInstr *getVRegDef(MachineRegisterInfo &MRI, Register Reg);
bool getVacantFunctionName(Module &M, std::string &Name);
void setRegClassType(Register Reg, const Type *Ty, SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder, bool Force = false);
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier::AccessQualifier AccessQual,
+ bool EmitIR, bool Force = false);
void setRegClassType(Register Reg, const MachineInstr *SpvType,
SPIRVGlobalRegistry *GR, MachineRegisterInfo *MRI,
const MachineFunction &MF, bool Force = false);
@@ -422,8 +424,9 @@ Register createVirtualRegister(const MachineInstr *SpvType,
Register createVirtualRegister(const MachineInstr *SpvType,
SPIRVGlobalRegistry *GR,
MachineIRBuilder &MIRBuilder);
-Register createVirtualRegister(const Type *Ty, SPIRVGlobalRegistry *GR,
- MachineIRBuilder &MIRBuilder);
+Register createVirtualRegister(
+ const Type *Ty, SPIRVGlobalRegistry *GR, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR);
// Return true if there is an opaque pointer type nested in the argument.
bool isNestedPointer(const Type *Ty);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index da4ef67..b24a45c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -121,7 +121,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
- for (auto T : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ for (auto T : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64, MVT::v8f16}) {
+ if (!Subtarget->hasFP16() && T == MVT::v8f16) {
+ continue;
+ }
// Don't expand the floating-point types to constant pools.
setOperationAction(ISD::ConstantFP, T, Legal);
// Expand floating-point comparisons.
@@ -140,18 +143,16 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Support minimum and maximum, which otherwise default to expand.
setOperationAction(ISD::FMINIMUM, T, Legal);
setOperationAction(ISD::FMAXIMUM, T, Legal);
- // WebAssembly currently has no builtin f16 support.
- setOperationAction(ISD::FP16_TO_FP, T, Expand);
- setOperationAction(ISD::FP_TO_FP16, T, Expand);
+    // When experimental v8f16 support is enabled, these instructions don't
+    // need to be expanded.
+ if (T != MVT::v8f16) {
+ setOperationAction(ISD::FP16_TO_FP, T, Expand);
+ setOperationAction(ISD::FP_TO_FP16, T, Expand);
+ }
setLoadExtAction(ISD::EXTLOAD, T, MVT::f16, Expand);
setTruncStoreAction(T, MVT::f16, Expand);
}
- if (Subtarget->hasFP16()) {
- setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
- }
-
// Expand unavailable integer operations.
for (auto Op :
{ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
@@ -228,6 +229,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
MVT::v2f64})
setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
+ if (Subtarget->hasFP16())
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
+
// Support splatting
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
MVT::v2f64})
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 14acc62..c591e5e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -558,7 +558,7 @@ defm SHUFFLE :
// Shuffles after custom lowering
def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
// The @llvm.wasm.shuffle intrinsic has immediate arguments that become TargetConstants.
def : Pat<(vec.vt (wasm_shuffle (vec.vt V128:$x), (vec.vt V128:$y),
(i32 timm:$m0), (i32 timm:$m1),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 2519592f..40ae4ae 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -119,20 +119,20 @@ public:
}
void setBasePointerVreg(unsigned Reg) { BasePtrVreg = Reg; }
- void stackifyVReg(MachineRegisterInfo &MRI, unsigned VReg) {
+ void stackifyVReg(MachineRegisterInfo &MRI, Register VReg) {
assert(MRI.getUniqueVRegDef(VReg));
- auto I = Register(VReg).virtRegIndex();
+ auto I = VReg.virtRegIndex();
if (I >= VRegStackified.size())
VRegStackified.resize(I + 1);
VRegStackified.set(I);
}
- void unstackifyVReg(unsigned VReg) {
- auto I = Register(VReg).virtRegIndex();
+ void unstackifyVReg(Register VReg) {
+ auto I = VReg.virtRegIndex();
if (I < VRegStackified.size())
VRegStackified.reset(I);
}
- bool isVRegStackified(unsigned VReg) const {
- auto I = Register(VReg).virtRegIndex();
+ bool isVRegStackified(Register VReg) const {
+ auto I = VReg.virtRegIndex();
if (I >= VRegStackified.size())
return false;
return VRegStackified.test(I);
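With the parameters now taken as Register instead of unsigned, call sites no longer need the explicit Register() wrap before calling virtRegIndex(). A minimal usage sketch (the register class is an assumption for illustration):

Register VReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MFI.stackifyVReg(MRI, VReg);        // implicit conversion still accepts
assert(MFI.isVRegStackified(VReg)); // raw unsigned vreg numbers, too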
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 269becb..2e88a27 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39602,7 +39602,7 @@ static bool matchBinaryPermuteShuffle(
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
@@ -39619,6 +39619,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<const SDNode *> SrcNodes,
bool AllowVariableCrossLaneMask,
bool AllowVariablePerLaneMask,
+ bool IsMaskedShuffle,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
@@ -39626,6 +39627,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
"Unexpected number of shuffle inputs!");
SDLoc DL(Root);
+ unsigned RootOpc = Root.getOpcode();
MVT RootVT = Root.getSimpleValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
@@ -39665,17 +39667,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
- // Don't combine if we are a AVX512/EVEX target and the mask element size
- // is different from the root element size - this would prevent writemasks
- // from being reused.
- bool IsMaskedShuffle = false;
- if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
- if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
- Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
- IsMaskedShuffle = true;
- }
- }
-
// If we are shuffling a splat (and not introducing zeros) then we can just
// use it directly. This works for smaller elements as well as they already
// repeat across each mask element.
@@ -39710,7 +39701,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
- if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
@@ -39767,7 +39758,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ScaledMask[1] == (ScaledMask[3] % 2));
if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ if (Depth == 0 && RootOpc == X86ISD::SHUF128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
@@ -39781,7 +39772,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
if (isUndefOrZero(Mask[1])) {
- if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
Res = CanonicalizeShuffleInput(RootVT, V1);
@@ -39795,7 +39786,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
!Subtarget.hasAVX2()) {
- if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
@@ -39803,7 +39794,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
}
- if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
+ if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
@@ -39820,7 +39811,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
- if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ if (Depth == 0 && RootOpc == X86ISD::SHUF128)
return SDValue(); // Nothing to do!
// TODO - handle AVX512VL cases with X86ISD::SHUF128.
@@ -39903,14 +39894,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = CanonicalizeShuffleInput(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
@@ -39923,7 +39914,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 0 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && RootOpc == Shuffle)
return SDValue(); // Nothing to do!
Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@@ -39935,7 +39926,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
PermuteImm) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 0 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && RootOpc == Shuffle)
return SDValue(); // Nothing to do!
Res = CanonicalizeShuffleInput(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
@@ -39955,7 +39946,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
DAG) &&
SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
@@ -39968,7 +39959,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
V2.getScalarValueSizeInBits() <= 32) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
@@ -39985,7 +39976,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 0 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && RootOpc == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
@@ -39999,7 +39990,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
AllowIntDomain, NewV1, NewV2, DL, DAG,
Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 0 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && RootOpc == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
@@ -40017,7 +40008,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
+ if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
@@ -40027,7 +40018,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
+ if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
@@ -40047,7 +40038,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ShuffleSrcVT.getVectorNumElements();
unsigned Opc =
IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
- if (Depth == 0 && Root.getOpcode() == Opc)
+ if (Depth == 0 && RootOpc == Opc)
return SDValue(); // Nothing to do!
V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
@@ -40064,9 +40055,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
// Bail if this was already a truncation or PACK node.
// We sometimes fail to match PACK if we demand known undef elements.
- if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
- Root.getOpcode() == X86ISD::PACKSS ||
- Root.getOpcode() == X86ISD::PACKUS))
+ if (Depth == 0 &&
+ (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
+ RootOpc == X86ISD::PACKUS))
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
@@ -40110,8 +40101,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
// If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
- if ((UnaryShuffle && Root.getOpcode() == X86ISD::VPERMV) ||
- Root.getOpcode() == X86ISD::VPERMV3)
+ if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
bool MaskContainsZeros = isAnyZero(Mask);
@@ -40167,7 +40157,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget))
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3,
@@ -40339,7 +40329,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget))
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3,
@@ -40378,7 +40368,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
ArrayRef<const SDNode *> SrcNodes, bool AllowVariableCrossLaneMask,
- bool AllowVariablePerLaneMask, SelectionDAG &DAG,
+ bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
@@ -40504,10 +40494,10 @@ static SDValue combineX86ShuffleChainWithExtract(
assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
"WideRootSize mismatch");
- if (SDValue WideShuffle =
- combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
- SrcNodes, AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget)) {
+ if (SDValue WideShuffle = combineX86ShuffleChain(
+ WideInputs, WideRoot, WideMask, Depth, SrcNodes,
+ AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
+ DAG, Subtarget)) {
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
@@ -41244,6 +41234,16 @@ static SDValue combineX86ShufflesRecursively(
resolveTargetShuffleInputsAndMask(Ops, Mask);
}
+  // If we are an AVX512/EVEX target, the mask element size should match the
+  // root element size to allow writemasks to be reused.
+ bool IsMaskedShuffle = false;
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
+ if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
+ Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
+ IsMaskedShuffle = true;
+ }
+ }
+
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -41268,7 +41268,7 @@ static SDValue combineX86ShufflesRecursively(
// Try to combine into a single shuffle instruction.
if (SDValue Shuffle = combineX86ShuffleChain(
Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget))
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget))
return Shuffle;
// If all the operands come from the same larger vector, fallthrough and try
@@ -41287,7 +41287,7 @@ static SDValue combineX86ShufflesRecursively(
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(
Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget);
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
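The IsMaskedShuffle computation removed from combineX86ShuffleChain reappears in combineX86ShufflesRecursively below, so it is evaluated once per recursion instead of on every chain attempt. A condensed sketch of the check itself (the helper name is hypothetical; the body mirrors the code added below):

static bool isMaskedShuffleUse(SDValue Root, unsigned RootSizeInBits,
                               bool HasVLX) {
  // Only 512-bit vectors, or 128/256-bit vectors with VLX, can carry an
  // EVEX writemask.
  if (RootSizeInBits != 512 && !(HasVLX && RootSizeInBits >= 128))
    return false;
  // The shuffle's sole user must be a VSELECT with an i1-element condition,
  // i.e. a writemask we want to keep reusable.
  return Root.hasOneUse() &&
         Root->user_begin()->getOpcode() == ISD::VSELECT &&
         Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1;
}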
@@ -51064,6 +51064,31 @@ static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
+static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ using namespace llvm::SDPatternMatch;
+
+ EVT VT = And->getValueType(0);
+ // Make sure this node is a candidate for BMI instructions.
+ if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
+ return SDValue();
+
+ SDValue X;
+ SDValue Y;
+ if (!sd_match(And, m_And(m_OneUse(m_Xor(m_Value(X),
+ m_OneUse(m_Neg(m_Deferred(X))))),
+ m_Value(Y))))
+ return SDValue();
+
+ SDValue BLSMSK =
+ DAG.getNode(ISD::XOR, DL, VT, X,
+ DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
+ SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
+ return AndN;
+}
+
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -51472,6 +51497,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
return R;
+ if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
+ return R;
+
return SDValue();
}
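The fold added above rests on a bit-twiddling identity: for nonzero X, X ^ (X - 1) is BLSMSK(X) (all bits up to and including the lowest set bit), and X ^ -X is exactly its complement, so Y & (X ^ -X) becomes ANDN(Y, BLSMSK(X)). A standalone check of that identity (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 1; X < (1u << 16); ++X) {
    uint32_t Blsmsk = X ^ (X - 1);     // BLSMSK: mask through lowest set bit
    assert((X ^ (0u - X)) == ~Blsmsk); // XOR(X, NEG(X)) == NOT(BLSMSK(X))
  }
  return 0;
}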
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index e850841..9cc1598 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -889,10 +889,20 @@ bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
return false;
}
+/// Returns true if \p F is either weak_odr or linkonce_odr.
+static bool isODR(const Function *F) {
+ return F->hasWeakODRLinkage() || F->hasLinkOnceODRLinkage();
+}
+
// Merge two equivalent functions. Upon completion, Function G is deleted.
void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
- if (F->isInterposable()) {
- assert(G->isInterposable());
+
+  // Create a new thunk that both F and G can call if F cannot call G
+  // directly. That is the case if F is interposable, or if G is either
+  // weak_odr or linkonce_odr.
+ if (F->isInterposable() || (isODR(F) && isODR(G))) {
+ assert((!isODR(G) || isODR(F)) &&
+ "if G is ODR, F must also be ODR due to ordering");
// Both writeThunkOrAlias() calls below must succeed, either because we can
// create aliases for G and NewF, or because a thunk for F is profitable.
@@ -913,6 +923,13 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
removeUsers(F);
F->replaceAllUsesWith(NewF);
+  // If G or NewF is (weak|linkonce)_odr, update all callers to call the
+  // thunk.
+ if (isODR(G))
+ replaceDirectCallers(G, F);
+ if (isODR(F))
+ replaceDirectCallers(NewF, F);
+
// We collect alignment before writeThunkOrAlias that overwrites NewF and
// G's content.
const MaybeAlign NewFAlign = NewF->getAlign();
@@ -986,16 +1003,24 @@ void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
// Ordering for functions that are equal under FunctionComparator
static bool isFuncOrderCorrect(const Function *F, const Function *G) {
+ if (isODR(F) != isODR(G)) {
+    // ODR functions before non-ODR functions. An ODR function can call a
+    // non-ODR function if it is not interposable, but not the other way around.
+ return isODR(G);
+ }
+
if (F->isInterposable() != G->isInterposable()) {
// Strong before weak, because the weak function may call the strong
// one, but not the other way around.
return !F->isInterposable();
}
+
if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
// External before local, because we definitely have to keep the external
// function, but may be able to drop the local one.
return !F->hasLocalLinkage();
}
+
// Impose a total order (by name) on the replacement of functions. This is
// important when operating on more than one module independently to prevent
// cycles of thunks calling each other when the modules are linked together.
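As a standalone sketch, the extended ordering predicate reads as follows (the helper name is illustrative; the body mirrors the hunks above plus the unchanged name comparison at the end):

static bool isFuncOrderCorrectSketch(const Function *F, const Function *G) {
  auto IsODR = [](const Function *Fn) {
    return Fn->hasWeakODRLinkage() || Fn->hasLinkOnceODRLinkage();
  };
  if (IsODR(F) != IsODR(G))
    return IsODR(G);              // ODR before non-ODR.
  if (F->isInterposable() != G->isInterposable())
    return !F->isInterposable();  // Strong before weak.
  if (F->hasLocalLinkage() != G->hasLocalLinkage())
    return !F->hasLocalLinkage(); // External before local.
  return F->getName() <= G->getName(); // Total order by name.
}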
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 622884e..221511e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -996,7 +996,8 @@ Value *InstCombinerImpl::simplifyNonNullOperand(Value *V,
if (!V->hasOneUse())
return nullptr;
- if (Depth == 1)
+ constexpr unsigned RecursionLimit = 3;
+ if (Depth == RecursionLimit)
return nullptr;
if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 79f6858..267eb319 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1435,8 +1435,9 @@ static bool checkAndReplaceCondition(
generateReproducer(Cmp, ReproducerModule, ReproducerCondStack, Info, DT);
Constant *ConstantC = ConstantInt::getBool(
CmpInst::makeCmpResultType(Cmp->getType()), IsTrue);
- Cmp->replaceUsesWithIf(ConstantC, [&DT, NumIn, NumOut,
- ContextInst](Use &U) {
+ bool Changed = false;
+ Cmp->replaceUsesWithIf(ConstantC, [&DT, NumIn, NumOut, ContextInst,
+ &Changed](Use &U) {
auto *UserI = getContextInstForUse(U);
auto *DTN = DT.getNode(UserI->getParent());
if (!DTN || DTN->getDFSNumIn() < NumIn || DTN->getDFSNumOut() > NumOut)
@@ -1448,12 +1449,14 @@ static bool checkAndReplaceCondition(
// Conditions in an assume trivially simplify to true. Skip uses
// in assume calls to not destroy the available information.
auto *II = dyn_cast<IntrinsicInst>(U.getUser());
- return !II || II->getIntrinsicID() != Intrinsic::assume;
+ bool ShouldReplace = !II || II->getIntrinsicID() != Intrinsic::assume;
+ Changed |= ShouldReplace;
+ return ShouldReplace;
});
NumCondsRemoved++;
if (Cmp->use_empty())
ToRemove.push_back(Cmp);
- return true;
+ return Changed;
};
if (auto ImpliedCondition =
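The pattern here generalizes: when replaceUsesWithIf filters uses through a predicate, the caller can only report an accurate changed status if the predicate records whether it ever approved a replacement. A minimal sketch (V, NewV, and the filter are placeholders):

bool Changed = false;
V->replaceUsesWithIf(NewV, [&Changed](Use &U) {
  bool ShouldReplace = !isa<IntrinsicInst>(U.getUser()); // example filter
  Changed |= ShouldReplace;
  return ShouldReplace;
});
// Changed is now true only if at least one use was actually rewritten.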
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 3f306bb..c3aea60 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -332,19 +332,19 @@ struct llvm::gvn::AvailableValueInBlock {
//===----------------------------------------------------------------------===//
GVNPass::Expression GVNPass::ValueTable::createExpr(Instruction *I) {
- Expression e;
- e.type = I->getType();
- e.opcode = I->getOpcode();
+ Expression E;
+ E.type = I->getType();
+ E.opcode = I->getOpcode();
if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(I)) {
// gc.relocate is a 'special' call: its second and third operands are
// not real values, but indices into the statepoint's argument list.
// Use the referred-to values for purposes of identity.
- e.varargs.push_back(lookupOrAdd(GCR->getOperand(0)));
- e.varargs.push_back(lookupOrAdd(GCR->getBasePtr()));
- e.varargs.push_back(lookupOrAdd(GCR->getDerivedPtr()));
+ E.varargs.push_back(lookupOrAdd(GCR->getOperand(0)));
+ E.varargs.push_back(lookupOrAdd(GCR->getBasePtr()));
+ E.varargs.push_back(lookupOrAdd(GCR->getDerivedPtr()));
} else {
for (Use &Op : I->operands())
- e.varargs.push_back(lookupOrAdd(Op));
+ E.varargs.push_back(lookupOrAdd(Op));
}
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
@@ -352,78 +352,78 @@ GVNPass::Expression GVNPass::ValueTable::createExpr(Instruction *I) {
// numbers. Since commutative operands are the 1st two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() >= 2 && "Unsupported commutative instruction!");
- if (e.varargs[0] > e.varargs[1])
- std::swap(e.varargs[0], e.varargs[1]);
- e.commutative = true;
+ if (E.varargs[0] > E.varargs[1])
+ std::swap(E.varargs[0], E.varargs[1]);
+ E.commutative = true;
}
if (auto *C = dyn_cast<CmpInst>(I)) {
// Sort the operand value numbers so x<y and y>x get the same value number.
CmpInst::Predicate Predicate = C->getPredicate();
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
+ if (E.varargs[0] > E.varargs[1]) {
+ std::swap(E.varargs[0], E.varargs[1]);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
- e.opcode = (C->getOpcode() << 8) | Predicate;
- e.commutative = true;
- } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
- e.varargs.append(E->idx_begin(), E->idx_end());
+ E.opcode = (C->getOpcode() << 8) | Predicate;
+ E.commutative = true;
+ } else if (auto *IVI = dyn_cast<InsertValueInst>(I)) {
+ E.varargs.append(IVI->idx_begin(), IVI->idx_end());
} else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
- e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
+ E.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
} else if (auto *CB = dyn_cast<CallBase>(I)) {
- e.attrs = CB->getAttributes();
+ E.attrs = CB->getAttributes();
}
- return e;
+ return E;
}
GVNPass::Expression GVNPass::ValueTable::createCmpExpr(
unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) {
assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
"Not a comparison!");
- Expression e;
- e.type = CmpInst::makeCmpResultType(LHS->getType());
- e.varargs.push_back(lookupOrAdd(LHS));
- e.varargs.push_back(lookupOrAdd(RHS));
+ Expression E;
+ E.type = CmpInst::makeCmpResultType(LHS->getType());
+ E.varargs.push_back(lookupOrAdd(LHS));
+ E.varargs.push_back(lookupOrAdd(RHS));
// Sort the operand value numbers so x<y and y>x get the same value number.
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
+ if (E.varargs[0] > E.varargs[1]) {
+ std::swap(E.varargs[0], E.varargs[1]);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
- e.opcode = (Opcode << 8) | Predicate;
- e.commutative = true;
- return e;
+ E.opcode = (Opcode << 8) | Predicate;
+ E.commutative = true;
+ return E;
}
GVNPass::Expression
GVNPass::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
assert(EI && "Not an ExtractValueInst?");
- Expression e;
- e.type = EI->getType();
- e.opcode = 0;
+ Expression E;
+ E.type = EI->getType();
+ E.opcode = 0;
WithOverflowInst *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
if (WO != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
// EI is an extract from one of our with.overflow intrinsics. Synthesize
// a semantically equivalent expression instead of an extract value
// expression.
- e.opcode = WO->getBinaryOp();
- e.varargs.push_back(lookupOrAdd(WO->getLHS()));
- e.varargs.push_back(lookupOrAdd(WO->getRHS()));
- return e;
+ E.opcode = WO->getBinaryOp();
+ E.varargs.push_back(lookupOrAdd(WO->getLHS()));
+ E.varargs.push_back(lookupOrAdd(WO->getRHS()));
+ return E;
}
// Not a recognised intrinsic. Fall back to producing an extract value
// expression.
- e.opcode = EI->getOpcode();
+ E.opcode = EI->getOpcode();
for (Use &Op : EI->operands())
- e.varargs.push_back(lookupOrAdd(Op));
+ E.varargs.push_back(lookupOrAdd(Op));
- append_range(e.varargs, EI->indices());
+ append_range(E.varargs, EI->indices());
- return e;
+ return E;
}
GVNPass::Expression GVNPass::ValueTable::createGEPExpr(GetElementPtrInst *GEP) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0bf2d71..226fc23 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9132,6 +9132,10 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
VPlan &Plan) {
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
+ // Nothing to do for unreachable exit blocks.
+ if (ExitVPBB->getNumPredecessors() == 0)
+ continue;
+
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3d660b6..02fea2ea 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5014,6 +5014,42 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
}
+/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
+/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
+/// instead of a scalar.
+static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
+ Type *ScalarTy, VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {}) {
+ assert(!isa<ScalableVectorType>(Ty) &&
+ "ScalableVectorType is not supported.");
+ assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
+ getNumElements(Ty) &&
+ "Incorrect usage.");
+ if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "Only supported by REVEC.");
+ // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+ // of CreateInsertElement.
+ unsigned ScalarTyNumElements = VecTy->getNumElements();
+ InstructionCost Cost = 0;
+ for (unsigned I : seq(DemandedElts.getBitWidth())) {
+ if (!DemandedElts[I])
+ continue;
+ if (Insert)
+ Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
+ I * ScalarTyNumElements, VecTy);
+ if (Extract)
+ Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+ I * ScalarTyNumElements, VecTy);
+ }
+ return Cost;
+ }
+ return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+ CostKind, VL);
+}
+
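A hypothetical call under REVEC (the types and surrounding values are assumptions for illustration): with ScalarTy = <4 x float> widened to <16 x float>, DemandedElts carries 16 / 4 = 4 bits, and each demanded bit is costed as one SK_InsertSubvector at offset I * 4 rather than four scalar inserts.

// Illustrative only; Ctx, TTI, and the cost kind are assumed to be in scope.
auto *ScalarTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 16);
APInt Demanded = APInt::getAllOnes(4);
InstructionCost Cost = getScalarizationOverhead(
    TTI, ScalarTy, VecTy, Demanded, /*Insert=*/true, /*Extract=*/false,
    TTI::TCK_RecipThroughput);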
/// Correctly creates insert_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using
/// \p Generator or the default shuffle.
@@ -5207,22 +5243,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
// Estimate the cost of masked gather GEP. If not a splat, roughly
// estimate as a buildvector, otherwise estimate as splat.
- APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
- VectorType *PtrVecTy =
- getWidenedType(PointerOps.front()->getType()->getScalarType(),
- VecTy->getNumElements());
+ APInt DemandedElts = APInt::getAllOnes(Sz);
+ Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
+ VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
if (static_cast<unsigned>(count_if(
PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
any_of(PointerOps, [&](Value *V) {
return getUnderlyingObject(V) !=
getUnderlyingObject(PointerOps.front());
}))
- VectorGEPCost += TTI.getScalarizationOverhead(
- PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
+ DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
else
VectorGEPCost +=
- TTI.getScalarizationOverhead(
- PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
+ getScalarizationOverhead(
+ TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
/*Insert=*/true, /*Extract=*/false, CostKind) +
::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
// The cost of scalar loads.
@@ -5240,8 +5276,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
/*VariableMask=*/false, CommonAlignment, CostKind) +
(ProfitableGatherPointers ? 0 : VectorGEPCost);
InstructionCost GatherCost =
- TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
- /*Extract=*/false, CostKind) +
+ getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
ScalarLoadsCost;
// The list of loads is small or perform partial check already - directly
// compare masked gather cost and gather cost.
@@ -5294,16 +5331,15 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Can be vectorized later as a series of loads/insertelements.
InstructionCost VecLdCost = 0;
if (!DemandedElts.isZero()) {
- VecLdCost =
- TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
- /*Extract=*/false, CostKind) +
- ScalarGEPCost;
+ VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarGEPCost;
for (unsigned Idx : seq<unsigned>(VL.size()))
if (DemandedElts[Idx])
VecLdCost +=
TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
}
- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
auto *SubVecTy = getWidenedType(ScalarTy, VF);
for (auto [I, LS] : enumerate(States)) {
auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5323,13 +5359,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
return getUnderlyingObject(V) !=
getUnderlyingObject(PointerOps.front());
}))
- VectorGEPCost += TTI.getScalarizationOverhead(
- SubVecTy, APInt::getAllOnes(VF),
+ VectorGEPCost += getScalarizationOverhead(
+ TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
/*Insert=*/true, /*Extract=*/false, CostKind);
else
VectorGEPCost +=
- TTI.getScalarizationOverhead(
- SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+ getScalarizationOverhead(
+ TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
/*Insert=*/true, /*Extract=*/false, CostKind) +
::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
CostKind);
@@ -9912,20 +9948,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
Idx, getWidenedType(ScalarTy, Sz));
}
- if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
- assert(SLPReVec && "Only supported by REVEC.");
- // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
- // of CreateInsertElement.
- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- for (unsigned I : seq<unsigned>(TE.Scalars.size()))
- if (DemandedElts[I])
- Cost +=
- TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
- CostKind, I * ScalarTyNumElements, FTy);
- } else {
- Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
- /*Extract=*/false, CostKind);
- }
+ Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind);
int Sz = TE.Scalars.size();
SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
@@ -9942,7 +9967,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
? TTI::SK_PermuteTwoSrc
: TTI::SK_PermuteSingleSrc,
VecTy, ReorderMask);
- DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ DemandedElts = APInt::getAllOnes(TE.Scalars.size());
ReorderMask.assign(Sz, PoisonMaskElem);
for (unsigned I : seq<unsigned>(Sz)) {
Value *V = TE.getOrdered(I);
@@ -9954,8 +9979,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
ReorderMask[I] = I + Sz;
}
}
- InstructionCost BVCost = TTI->getScalarizationOverhead(
- VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+ InstructionCost BVCost =
+ getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+ /*Insert=*/true, /*Extract=*/false, CostKind);
if (!DemandedElts.isAllOnes())
BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
if (Cost >= BVCost) {
@@ -11603,9 +11629,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(Offset < NumElts && "Failed to find vector index offset");
InstructionCost Cost = 0;
- Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
- /*Insert*/ true, /*Extract*/ false,
- CostKind);
+ Cost -=
+ getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
+ /*Insert*/ true, /*Extract*/ false, CostKind);
// First cost - resize to the actual vector size if this is not an identity
// shuffle or the vector needs to be shifted.
@@ -13780,8 +13806,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
}
if (!IsIdentity)
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
- FirstShuffleCost += TTI->getScalarizationOverhead(
- MaskVecTy, DemandedElts, /*Insert=*/true,
+ FirstShuffleCost += getScalarizationOverhead(
+ *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
InstructionCost SecondShuffleCost = 0;
@@ -13805,17 +13831,17 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
}
if (!IsIdentity)
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
- SecondShuffleCost += TTI->getScalarizationOverhead(
- MaskVecTy, DemandedElts, /*Insert=*/true,
+ SecondShuffleCost += getScalarizationOverhead(
+ *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
for (auto [I, Idx] : enumerate(SubMask))
if (Idx == PoisonMaskElem)
DemandedElts.clearBit(I);
- InstructionCost BuildVectorCost =
- TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
- /*Extract=*/false, CostKind);
+ InstructionCost BuildVectorCost = getScalarizationOverhead(
+ *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind);
const TreeEntry *BestEntry = nullptr;
if (FirstShuffleCost < ShuffleCost) {
std::for_each(std::next(Mask.begin(), Part * VL.size()),
@@ -13968,45 +13994,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
ShuffledElements.setBit(I);
ShuffleMask[I] = Res.first->second;
}
- if (!DemandedElements.isZero()) {
- if (isa<FixedVectorType>(ScalarTy)) {
- assert(SLPReVec && "Only supported by REVEC.");
- // We don't need to insert elements one by one. Instead, we can insert the
- // entire vector into the destination.
- Cost = 0;
- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- for (unsigned I : seq<unsigned>(VL.size()))
- if (DemandedElements[I])
- Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
- CostKind, I * ScalarTyNumElements,
- cast<FixedVectorType>(ScalarTy));
- } else {
- Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
- /*Insert=*/true,
- /*Extract=*/false, CostKind, VL);
- }
- }
- if (ForPoisonSrc) {
- if (isa<FixedVectorType>(ScalarTy)) {
- assert(SLPReVec && "Only supported by REVEC.");
- // We don't need to insert elements one by one. Instead, we can insert the
- // entire vector into the destination.
- assert(DemandedElements.isZero() &&
- "Need to consider the cost from DemandedElements.");
- Cost = 0;
- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
- for (unsigned I : seq<unsigned>(VL.size()))
- if (!ShuffledElements[I])
- Cost += TTI->getShuffleCost(
- TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
- I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
- } else {
- Cost = TTI->getScalarizationOverhead(VecTy,
- /*DemandedElts*/ ~ShuffledElements,
- /*Insert*/ true,
- /*Extract*/ false, CostKind, VL);
- }
- }
+ if (!DemandedElements.isZero())
+ Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind, VL);
+ if (ForPoisonSrc)
+ Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
+ /*DemandedElts*/ ~ShuffledElements,
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind, VL);
if (DuplicateNonConst)
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
VecTy, ShuffleMask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 121678b..d3c195d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -850,6 +850,11 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
VPlan::VPlan(Loop *L) {
setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
ScalarHeader = createVPIRBasicBlock(L->getHeader());
+
+ SmallVector<BasicBlock *> IRExitBlocks;
+ L->getExitBlocks(IRExitBlocks);
+ for (BasicBlock *EB : IRExitBlocks)
+ ExitBlocks.push_back(createVPIRBasicBlock(EB));
}
VPlan::~VPlan() {
@@ -931,7 +936,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
- auto *VPExitBlock = Plan->createVPIRBasicBlock(IRExitBlock);
+ VPIRBasicBlock *VPExitBlock = Plan->getExitBlock(IRExitBlock);
// The connection order corresponds to the operands of the conditional branch.
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -983,6 +988,14 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
}
}
+VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
+ auto Iter = find_if(getExitBlocks(), [IRBB](const VPIRBasicBlock *VPIRBB) {
+ return VPIRBB->getIRBasicBlock() == IRBB;
+ });
+ assert(Iter != getExitBlocks().end() && "no exit block found");
+ return *Iter;
+}
+
bool VPlan::isExitBlock(VPBlockBase *VPBB) {
return isa<VPIRBasicBlock>(VPBB) && VPBB->getNumSuccessors() == 0;
}
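A usage sketch of the new lookup (this is how the VPlanTransforms hunk later in this diff consumes it): exit blocks are created once in the VPlan constructor, and transforms fetch the existing wrapper instead of building a duplicate.

// IRExitBlock must be an exit block of the original scalar loop.
VPIRBasicBlock *VPExitBlock = Plan.getExitBlock(IRExitBlock);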
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d86914f..1f1af7f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3426,6 +3426,11 @@ class VPlan {
/// VPIRBasicBlock wrapping the header of the original scalar loop.
VPIRBasicBlock *ScalarHeader;
+ /// Immutable list of VPIRBasicBlocks wrapping the exit blocks of the original
+ /// scalar loop. Note that some exit blocks may be unreachable at the moment,
+ /// e.g. if the scalar epilogue always executes.
+ SmallVector<VPIRBasicBlock *, 2> ExitBlocks;
+
/// Holds the VFs applicable to this VPlan.
SmallSetVector<ElementCount, 2> VFs;
@@ -3559,11 +3564,13 @@ public:
/// Return the VPIRBasicBlock wrapping the header of the scalar loop.
VPIRBasicBlock *getScalarHeader() const { return ScalarHeader; }
- /// Return an iterator range over the VPIRBasicBlock wrapping the exit blocks
- /// of the VPlan, that is leaf nodes except the scalar header. Defined in
- /// VPlanHCFG, as the definition of the type needs access to the definitions
- /// of VPBlockShallowTraversalWrapper.
- auto getExitBlocks();
+ /// Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of
+ /// the original scalar loop.
+ ArrayRef<VPIRBasicBlock *> getExitBlocks() const { return ExitBlocks; }
+
+ /// Return the VPIRBasicBlock corresponding to \p IRBB. \p IRBB must be an
+ /// exit block.
+ VPIRBasicBlock *getExitBlock(BasicBlock *IRBB) const;
/// Returns true if \p VPBB is an exit block.
bool isExitBlock(VPBlockBase *VPBB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index 8fbdacd..a1014c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -307,15 +307,6 @@ template <> struct GraphTraits<VPlan *> {
}
};
-inline auto VPlan::getExitBlocks() {
- VPBlockBase *ScalarHeader = getScalarHeader();
- return make_filter_range(
- VPBlockUtils::blocksOnly<VPIRBasicBlock>(
- vp_depth_first_shallow(getVectorLoopRegion()->getSingleSuccessor())),
- [ScalarHeader](VPIRBasicBlock *VPIRBB) {
- return VPIRBB != ScalarHeader && VPIRBB->getNumSuccessors() == 0;
- });
-}
} // namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8e77327..13ef302 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2049,18 +2049,9 @@ void VPlanTransforms::handleUncountableEarlyExit(
cast<BranchInst>(UncountableExitingBlock->getTerminator());
BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
-
- // The early exit block may or may not be the same as the "countable" exit
- // block. Creates a new VPIRBB for the early exit block in case it is distinct
- // from the countable exit block.
- // TODO: Introduce both exit blocks during VPlan skeleton construction.
- VPIRBasicBlock *VPEarlyExitBlock;
- if (OrigLoop->getUniqueExitBlock()) {
- VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
- } else {
- VPEarlyExitBlock = Plan.createVPIRBasicBlock(
- !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
- }
+ BasicBlock *EarlyExitIRBB =
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
+ VPIRBasicBlock *VPEarlyExitBlock = Plan.getExitBlock(EarlyExitIRBB);
VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index ef52d0d..50ed702 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -3,7 +3,7 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-define i32 @sdiv() {
+define void @sdiv() {
; CHECK-LABEL: 'sdiv'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef
@@ -28,7 +28,7 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, undef
@@ -58,10 +58,10 @@ define i32 @sdiv() {
%V32i8 = sdiv <32 x i8> undef, undef
%V64i8 = sdiv <64 x i8> undef, undef
- ret i32 undef
+ ret void
}
-define i32 @udiv() {
+define void @udiv() {
; CHECK-LABEL: 'udiv'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef
@@ -86,7 +86,7 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, undef
@@ -116,10 +116,176 @@ define i32 @udiv() {
%V32i8 = udiv <32 x i8> undef, undef
%V64i8 = udiv <64 x i8> undef, undef
- ret i32 undef
+ ret void
}
-define i32 @sdiv_const() {
+define void @sdiv_uniform() {
+; CHECK-LABEL: 'sdiv_uniform'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, %V2i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, %V4i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, %V8i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, %V2i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, %V4i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, %V8i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, %V16i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, %V2i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, %V4i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, %V8i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, %V16i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, %V32i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, %V2i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, %V4i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, %V8i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, %V16i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, %V32i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, %V64i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+ %V2i64 = sdiv <2 x i64> undef, %V2i64_s
+ %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+ %V4i64 = sdiv <4 x i64> undef, %V4i64_s
+ %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+ %V8i64 = sdiv <8 x i64> undef, %V8i64_s
+
+ %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+ %V2i32 = sdiv <2 x i32> undef, %V2i32_s
+ %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+ %V4i32 = sdiv <4 x i32> undef, %V4i32_s
+ %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+ %V8i32 = sdiv <8 x i32> undef, %V8i32_s
+ %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+ %V16i32 = sdiv <16 x i32> undef, %V16i32_s
+
+ %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+ %V2i16 = sdiv <2 x i16> undef, %V2i16_s
+ %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+ %V4i16 = sdiv <4 x i16> undef, %V4i16_s
+ %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+ %V8i16 = sdiv <8 x i16> undef, %V8i16_s
+ %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+ %V16i16 = sdiv <16 x i16> undef, %V16i16_s
+ %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+ %V32i16 = sdiv <32 x i16> undef, %V32i16_s
+
+ %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+ %V2i8 = sdiv <2 x i8> undef, %V2i8_s
+ %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+ %V4i8 = sdiv <4 x i8> undef, %V4i8_s
+ %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+ %V8i8 = sdiv <8 x i8> undef, %V8i8_s
+ %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+ %V16i8 = sdiv <16 x i8> undef, %V16i8_s
+ %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+ %V32i8 = sdiv <32 x i8> undef, %V32i8_s
+ %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+ %V64i8 = sdiv <64 x i8> undef, %V64i8_s
+
+ ret void
+}
+
+define void @udiv_uniform() {
+; CHECK-LABEL: 'udiv_uniform'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, %V2i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, %V4i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, %V8i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, %V2i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, %V4i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, %V8i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, %V16i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, %V2i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, %V4i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, %V8i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, %V16i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, %V32i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, %V2i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, %V4i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, %V8i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, %V16i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, %V32i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, %V64i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+ %V2i64 = udiv <2 x i64> undef, %V2i64_s
+ %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+ %V4i64 = udiv <4 x i64> undef, %V4i64_s
+ %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+ %V8i64 = udiv <8 x i64> undef, %V8i64_s
+
+ %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+ %V2i32 = udiv <2 x i32> undef, %V2i32_s
+ %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+ %V4i32 = udiv <4 x i32> undef, %V4i32_s
+ %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+ %V8i32 = udiv <8 x i32> undef, %V8i32_s
+ %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+ %V16i32 = udiv <16 x i32> undef, %V16i32_s
+
+ %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+ %V2i16 = udiv <2 x i16> undef, %V2i16_s
+ %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+ %V4i16 = udiv <4 x i16> undef, %V4i16_s
+ %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+ %V8i16 = udiv <8 x i16> undef, %V8i16_s
+ %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+ %V16i16 = udiv <16 x i16> undef, %V16i16_s
+ %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+ %V32i16 = udiv <32 x i16> undef, %V32i16_s
+
+ %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+ %V2i8 = udiv <2 x i8> undef, %V2i8_s
+ %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+ %V4i8 = udiv <4 x i8> undef, %V4i8_s
+ %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+ %V8i8 = udiv <8 x i8> undef, %V8i8_s
+ %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+ %V16i8 = udiv <16 x i8> undef, %V16i8_s
+ %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+ %V32i8 = udiv <32 x i8> undef, %V32i8_s
+ %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+ %V64i8 = udiv <64 x i8> undef, %V64i8_s
+
+ ret void
+}
+
+define void @sdiv_const() {
; CHECK-LABEL: 'sdiv_const'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7
@@ -144,7 +310,7 @@ define i32 @sdiv_const() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, 7
@@ -174,10 +340,10 @@ define i32 @sdiv_const() {
%V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
%V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
- ret i32 undef
+ ret void
}
-define i32 @udiv_const() {
+define void @udiv_const() {
; CHECK-LABEL: 'udiv_const'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7
@@ -202,7 +368,7 @@ define i32 @udiv_const() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, 7
@@ -233,10 +399,10 @@ define i32 @udiv_const() {
%V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
%V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
- ret i32 undef
+ ret void
}
-define i32 @sdiv_uniformconst() {
+define void @sdiv_uniformconst() {
; CHECK-LABEL: 'sdiv_uniformconst'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7
@@ -261,7 +427,7 @@ define i32 @sdiv_uniformconst() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 7)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, 7
@@ -291,10 +457,10 @@ define i32 @sdiv_uniformconst() {
%V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
- ret i32 undef
+ ret void
}
-define i32 @udiv_uniformconst() {
+define void @udiv_uniformconst() {
; CHECK-LABEL: 'udiv_uniformconst'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 7
@@ -319,7 +485,7 @@ define i32 @udiv_uniformconst() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 7)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, 7
@@ -349,10 +515,10 @@ define i32 @udiv_uniformconst() {
%V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
- ret i32 undef
+ ret void
}
-define i32 @sdiv_constpow2() {
+define void @sdiv_constpow2() {
; CHECK-LABEL: 'sdiv_constpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
@@ -377,7 +543,7 @@ define i32 @sdiv_constpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, 16
@@ -407,10 +573,10 @@ define i32 @sdiv_constpow2() {
%V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
%V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @udiv_constpow2() {
+define void @udiv_constpow2() {
; CHECK-LABEL: 'udiv_constpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16
@@ -435,7 +601,7 @@ define i32 @udiv_constpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, 16
@@ -465,10 +631,10 @@ define i32 @udiv_constpow2() {
%V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
%V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @sdiv_uniformconstpow2() {
+define void @sdiv_uniformconstpow2() {
; CHECK-LABEL: 'sdiv_uniformconstpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16
@@ -493,7 +659,7 @@ define i32 @sdiv_uniformconstpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, 16
@@ -523,10 +689,10 @@ define i32 @sdiv_uniformconstpow2() {
%V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
%V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @udiv_uniformconstpow2() {
+define void @udiv_uniformconstpow2() {
; CHECK-LABEL: 'udiv_uniformconstpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16
@@ -551,7 +717,7 @@ define i32 @udiv_uniformconstpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, 16
@@ -581,10 +747,10 @@ define i32 @udiv_uniformconstpow2() {
%V32i8 = udiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
%V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @sdiv_constnegpow2() {
+define void @sdiv_constnegpow2() {
; CHECK-LABEL: 'sdiv_constnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16
@@ -609,7 +775,7 @@ define i32 @sdiv_constnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, -16
@@ -639,10 +805,10 @@ define i32 @sdiv_constnegpow2() {
%V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
%V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @udiv_constnegpow2() {
+define void @udiv_constnegpow2() {
; CHECK-LABEL: 'udiv_constnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16
@@ -667,7 +833,7 @@ define i32 @udiv_constnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, -16
@@ -697,10 +863,10 @@ define i32 @udiv_constnegpow2() {
%V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
%V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @sdiv_uniformconstnegpow2() {
+define void @sdiv_uniformconstnegpow2() {
; CHECK-LABEL: 'sdiv_uniformconstnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = sdiv i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16
@@ -725,7 +891,7 @@ define i32 @sdiv_uniformconstnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = sdiv i128 undef, -16
@@ -755,10 +921,10 @@ define i32 @sdiv_uniformconstnegpow2() {
%V32i8 = sdiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
%V64i8 = sdiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @udiv_uniformconstnegpow2() {
+define void @udiv_uniformconstnegpow2() {
; CHECK-LABEL: 'udiv_uniformconstnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I128 = udiv i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16
@@ -783,7 +949,7 @@ define i32 @udiv_uniformconstnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 -16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = udiv i128 undef, -16
@@ -813,5 +979,5 @@ define i32 @udiv_uniformconstnegpow2() {
%V32i8 = udiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
%V64i8 = udiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index 06c05ae..ef40c9d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -3,7 +3,7 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-define i32 @srem() {
+define void @srem() {
; CHECK-LABEL: 'srem'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
@@ -28,7 +28,7 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, undef
@@ -58,10 +58,10 @@ define i32 @srem() {
%V32i8 = srem <32 x i8> undef, undef
%V64i8 = srem <64 x i8> undef, undef
- ret i32 undef
+ ret void
}
-define i32 @urem() {
+define void @urem() {
; CHECK-LABEL: 'urem'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
@@ -86,7 +86,7 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, undef
@@ -116,10 +116,176 @@ define i32 @urem() {
%V32i8 = urem <32 x i8> undef, undef
%V64i8 = urem <64 x i8> undef, undef
- ret i32 undef
+ ret void
}
-define i32 @srem_const() {
+define void @srem_uniform() {
+; CHECK-LABEL: 'srem_uniform'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, %V2i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, %V4i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, %V8i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, %V2i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, %V4i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, %V8i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, %V16i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, %V2i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, %V4i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, %V8i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, %V16i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, %V32i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, %V2i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, %V4i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, %V8i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, %V16i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, %V32i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, %V64i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+ %V2i64 = srem <2 x i64> undef, %V2i64_s
+ %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+ %V4i64 = srem <4 x i64> undef, %V4i64_s
+ %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+ %V8i64 = srem <8 x i64> undef, %V8i64_s
+
+ %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+ %V2i32 = srem <2 x i32> undef, %V2i32_s
+ %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+ %V4i32 = srem <4 x i32> undef, %V4i32_s
+ %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+ %V8i32 = srem <8 x i32> undef, %V8i32_s
+ %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+ %V16i32 = srem <16 x i32> undef, %V16i32_s
+
+ %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+ %V2i16 = srem <2 x i16> undef, %V2i16_s
+ %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+ %V4i16 = srem <4 x i16> undef, %V4i16_s
+ %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+ %V8i16 = srem <8 x i16> undef, %V8i16_s
+ %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+ %V16i16 = srem <16 x i16> undef, %V16i16_s
+ %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+ %V32i16 = srem <32 x i16> undef, %V32i16_s
+
+ %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+ %V2i8 = srem <2 x i8> undef, %V2i8_s
+ %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+ %V4i8 = srem <4 x i8> undef, %V4i8_s
+ %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+ %V8i8 = srem <8 x i8> undef, %V8i8_s
+ %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+ %V16i8 = srem <16 x i8> undef, %V16i8_s
+ %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+ %V32i8 = srem <32 x i8> undef, %V32i8_s
+ %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+ %V64i8 = srem <64 x i8> undef, %V64i8_s
+
+ ret void
+}
+
+define void @urem_uniform() {
+; CHECK-LABEL: 'urem_uniform'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, %V2i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, %V4i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, %V8i64_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, %V2i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, %V4i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, %V8i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, %V16i32_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, %V2i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, %V4i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, %V8i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, %V16i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, %V32i16_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, %V2i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, %V4i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, %V8i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, %V16i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, %V32i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, %V64i8_s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer
+ %V2i64 = urem <2 x i64> undef, %V2i64_s
+ %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer
+ %V4i64 = urem <4 x i64> undef, %V4i64_s
+ %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer
+ %V8i64 = urem <8 x i64> undef, %V8i64_s
+
+ %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer
+ %V2i32 = urem <2 x i32> undef, %V2i32_s
+ %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer
+ %V4i32 = urem <4 x i32> undef, %V4i32_s
+ %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
+ %V8i32 = urem <8 x i32> undef, %V8i32_s
+ %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer
+ %V16i32 = urem <16 x i32> undef, %V16i32_s
+
+ %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer
+ %V2i16 = urem <2 x i16> undef, %V2i16_s
+ %V4i16_s = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> zeroinitializer
+ %V4i16 = urem <4 x i16> undef, %V4i16_s
+ %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer
+ %V8i16 = urem <8 x i16> undef, %V8i16_s
+ %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer
+ %V16i16 = urem <16 x i16> undef, %V16i16_s
+ %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer
+ %V32i16 = urem <32 x i16> undef, %V32i16_s
+
+ %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer
+ %V2i8 = urem <2 x i8> undef, %V2i8_s
+ %V4i8_s = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> zeroinitializer
+ %V4i8 = urem <4 x i8> undef, %V4i8_s
+ %V8i8_s = shufflevector <8 x i8> poison, <8 x i8> poison, <8 x i32> zeroinitializer
+ %V8i8 = urem <8 x i8> undef, %V8i8_s
+ %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer
+ %V16i8 = urem <16 x i8> undef, %V16i8_s
+ %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer
+ %V32i8 = urem <32 x i8> undef, %V32i8_s
+ %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer
+ %V64i8 = urem <64 x i8> undef, %V64i8_s
+
+ ret void
+}
+
+define void @srem_const() {
; CHECK-LABEL: 'srem_const'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
@@ -144,7 +310,7 @@ define i32 @srem_const() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, 7
@@ -174,10 +340,10 @@ define i32 @srem_const() {
%V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
%V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
- ret i32 undef
+ ret void
}
-define i32 @urem_const() {
+define void @urem_const() {
; CHECK-LABEL: 'urem_const'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
@@ -202,7 +368,7 @@ define i32 @urem_const() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, 7
@@ -233,10 +399,10 @@ define i32 @urem_const() {
%V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
%V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
- ret i32 undef
+ ret void
}
-define i32 @srem_uniformconst() {
+define void @srem_uniformconst() {
; CHECK-LABEL: 'srem_uniformconst'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
@@ -261,7 +427,7 @@ define i32 @srem_uniformconst() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, 7
@@ -291,10 +457,10 @@ define i32 @srem_uniformconst() {
%V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
- ret i32 undef
+ ret void
}
-define i32 @urem_uniformconst() {
+define void @urem_uniformconst() {
; CHECK-LABEL: 'urem_uniformconst'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
@@ -319,7 +485,7 @@ define i32 @urem_uniformconst() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, 7
@@ -349,10 +515,10 @@ define i32 @urem_uniformconst() {
%V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
- ret i32 undef
+ ret void
}
-define i32 @srem_constpow2() {
+define void @srem_constpow2() {
; CHECK-LABEL: 'srem_constpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
@@ -377,7 +543,7 @@ define i32 @srem_constpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, 16
@@ -407,10 +573,10 @@ define i32 @srem_constpow2() {
%V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
%V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @urem_constpow2() {
+define void @urem_constpow2() {
; CHECK-LABEL: 'urem_constpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
@@ -435,7 +601,7 @@ define i32 @urem_constpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, 16
@@ -465,10 +631,10 @@ define i32 @urem_constpow2() {
%V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
%V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @srem_uniformconstpow2() {
+define void @srem_uniformconstpow2() {
; CHECK-LABEL: 'srem_uniformconstpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
@@ -493,7 +659,7 @@ define i32 @srem_uniformconstpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, 16
@@ -523,10 +689,10 @@ define i32 @srem_uniformconstpow2() {
%V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
%V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @urem_uniformconstpow2() {
+define void @urem_uniformconstpow2() {
; CHECK-LABEL: 'urem_uniformconstpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
@@ -551,7 +717,7 @@ define i32 @urem_uniformconstpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, 16
@@ -581,10 +747,10 @@ define i32 @urem_uniformconstpow2() {
%V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
%V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
- ret i32 undef
+ ret void
}
-define i32 @srem_constnegpow2() {
+define void @srem_constnegpow2() {
; CHECK-LABEL: 'srem_constnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
@@ -609,7 +775,7 @@ define i32 @srem_constnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, -16
@@ -639,10 +805,10 @@ define i32 @srem_constnegpow2() {
%V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
%V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @urem_constnegpow2() {
+define void @urem_constnegpow2() {
; CHECK-LABEL: 'urem_constnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
@@ -667,7 +833,7 @@ define i32 @urem_constnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, -16
@@ -697,10 +863,10 @@ define i32 @urem_constnegpow2() {
%V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
%V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @srem_uniformconstnegpow2() {
+define void @srem_uniformconstnegpow2() {
; CHECK-LABEL: 'srem_uniformconstnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
@@ -725,7 +891,7 @@ define i32 @srem_uniformconstnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = srem i128 undef, -16
@@ -755,10 +921,10 @@ define i32 @srem_uniformconstnegpow2() {
%V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
%V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
- ret i32 undef
+ ret void
}
-define i32 @urem_uniformconstnegpow2() {
+define void @urem_uniformconstnegpow2() {
; CHECK-LABEL: 'urem_uniformconstnegpow2'
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
@@ -783,7 +949,7 @@ define i32 @urem_uniformconstnegpow2() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I128 = urem i128 undef, -16
@@ -813,5 +979,5 @@ define i32 @urem_uniformconstnegpow2() {
%V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
%V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
- ret i32 undef
+ ret void
}
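; All of the cost-model functions above were switched from returning i32 undef
; to void, so the terminator is costed at 0 in every run, and uniform divisors
; in the output are printed with the splat shorthand even though the input IR
; spells out the full vector. A minimal sketch of the test pattern (the RUN
; line is an assumption, not part of this patch):
; RUN: opt < %s -passes="print<cost-model>" -disable-output 2>&1 | FileCheck %s
define void @rem_sketch() {
  %r64 = srem i64 undef, 7                                  ; costed: scalar srem by constant
  %v4 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>  ; printed back as splat (i32 7)
  ret void                                                  ; costed at 0
}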
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-reverse.ll
index 807685c..7a11d6f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-reverse.ll
@@ -77,11 +77,11 @@ define void @reverse() {
define void @vrev64() {
; CHECK-LABEL: 'vrev64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -90,10 +90,10 @@ define void @vrev64() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -140,18 +140,18 @@ define void @vrev64() {
define void @vrev32() {
; CHECK-LABEL: 'vrev32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -187,12 +187,12 @@ define void @vrev32() {
define void @vrev16() {
; CHECK-LABEL: 'vrev16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
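; The reduced numbers are consistent with these masks now being recognized as
; single NEON reverse ops (an inference from the checks, not stated in the
; patch): element reversal within 64/32/16-bit chunks is one REV64/REV32/REV16
; per 128-bit vector (cost 1), and two instructions for a 256-bit vector
; (cost 2). Minimal sketch:
define <8 x i16> @rev32_sketch(<8 x i16> %v) {
  ; swaps the two i16 lanes inside each 32-bit chunk -> rev32 v0.8h
  %r = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %r
}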
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
index ab7151c..a416124 100644
--- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
+++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
@@ -106,6 +106,30 @@ define void @test_typedbuffer() {
; CHECK: Element Type: f32
; CHECK: Element Count: 4
+ %cb0 = call target("dx.CBuffer", {float})
+ @llvm.dx.resource.handlefrombinding(i32 1, i32 0, i32 1, i32 0, i1 false)
+ ; CHECK: Binding [[CB0:[0-9]+]]:
+ ; CHECK: Binding:
+ ; CHECK: Record ID: 0
+ ; CHECK: Space: 1
+ ; CHECK: Lower Bound: 0
+ ; CHECK: Size: 1
+ ; CHECK: Class: CBuffer
+ ; CHECK: Kind: CBuffer
+ ; CHECK: CBuffer size: 4
+
+ %cb1 = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, i1 false)
+ ; CHECK: Binding [[CB1:[0-9]+]]:
+ ; CHECK: Binding:
+ ; CHECK: Record ID: 1
+ ; CHECK: Space: 1
+ ; CHECK: Lower Bound: 8
+ ; CHECK: Size: 1
+ ; CHECK: Class: CBuffer
+ ; CHECK: Kind: CBuffer
+ ; CHECK: CBuffer size: 4
+
; CHECK-NOT: Binding {{[0-9]+}}:
ret void
@@ -118,5 +142,7 @@ define void @test_typedbuffer() {
; CHECK-DAG: Call bound to [[UAV1]]: %uav1 =
; CHECK-DAG: Call bound to [[UAV2]]: %uav2_1 =
; CHECK-DAG: Call bound to [[UAV2]]: %uav2_2 =
+; CHECK-DAG: Call bound to [[CB0]]: %cb0 =
+; CHECK-DAG: Call bound to [[CB1]]: %cb1 =
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
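; Operand order for the handle intrinsic, inferred from the checks above (an
; assumption read off the binding fields, not spelled out in the patch):
; register space, lower bound, range size, index within the range, and a
; non-uniform flag. Self-contained sketch mirroring the new cbuffer case:
define void @cbuffer_sketch() {
  %cb = call target("dx.CBuffer", {float})
      @llvm.dx.resource.handlefrombinding(i32 1, i32 0, i32 1, i32 0, i1 false)
      ; space=1, lower bound=0, size=1, index=0, non-uniform=false
  ret void
}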
diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
index 6b12d95..43d49da 100644
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -13,25 +13,26 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define i64 @bfis_in_loop_zero() {
; CHECK-LABEL: bfis_in_loop_zero:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: adrp x8, :got:global
-; CHECK-NEXT: mov x0, xzr
-; CHECK-NEXT: mov w9, wzr
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:global]
-; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: .LBB0_1: // %midblock
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrh w10, [x8, #72]
-; CHECK-NEXT: ldr x13, [x8, #8]
-; CHECK-NEXT: lsr w11, w10, #8
-; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: ldr x8, [x13, #16]
-; CHECK-NEXT: cset w12, ne
-; CHECK-NEXT: csel w9, w9, w11, eq
-; CHECK-NEXT: and x11, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi w10, w9, #8, #24
-; CHECK-NEXT: orr x11, x11, x12, lsl #16
-; CHECK-NEXT: orr x0, x11, x10
-; CHECK-NEXT: cbnz x13, .LBB0_1
+; CHECK-NEXT: adrp x9, :got:global
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ldr x9, [x9, :got_lo12:global]
+; CHECK-NEXT: mov w10, #65536 // =0x10000
+; CHECK-NEXT: ldr x9, [x9]
+; CHECK-NEXT: .LBB0_1: // %midblock
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldrh w11, [x9, #72]
+; CHECK-NEXT: and x13, x0, #0xffffffff00000000
+; CHECK-NEXT: lsr w12, w11, #8
+; CHECK-NEXT: cmp w11, #0
+; CHECK-NEXT: csel w8, w8, w12, eq
+; CHECK-NEXT: ldr x12, [x9, #8]
+; CHECK-NEXT: csel x9, xzr, x10, eq
+; CHECK-NEXT: bfi w11, w8, #8, #24
+; CHECK-NEXT: orr x13, x9, x13
+; CHECK-NEXT: ldr x9, [x12, #16]
+; CHECK-NEXT: orr x0, x13, x11
+; CHECK-NEXT: cbnz x12, .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
entry:
@@ -80,25 +81,26 @@ exit:
define i64 @bfis_in_loop_undef() {
; CHECK-LABEL: bfis_in_loop_undef:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: adrp x9, :got:global
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: // implicit-def: $x0
-; CHECK-NEXT: ldr x9, [x9, :got_lo12:global]
-; CHECK-NEXT: ldr x9, [x9]
-; CHECK-NEXT: .LBB1_1: // %midblock
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrh w10, [x9, #72]
-; CHECK-NEXT: ldr x13, [x9, #8]
-; CHECK-NEXT: lsr w11, w10, #8
-; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: ldr x9, [x13, #16]
-; CHECK-NEXT: cset w12, ne
-; CHECK-NEXT: csel w8, w8, w11, eq
-; CHECK-NEXT: and x11, x0, #0xffffffff00000000
-; CHECK-NEXT: bfi w10, w8, #8, #24
-; CHECK-NEXT: orr x11, x11, x12, lsl #16
-; CHECK-NEXT: orr x0, x11, x10
-; CHECK-NEXT: cbnz x13, .LBB1_1
+; CHECK-NEXT: adrp x9, :got:global
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: // implicit-def: $x0
+; CHECK-NEXT: ldr x9, [x9, :got_lo12:global]
+; CHECK-NEXT: ldr x10, [x9]
+; CHECK-NEXT: mov w9, #65536 // =0x10000
+; CHECK-NEXT: .LBB1_1: // %midblock
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldrh w11, [x10, #72]
+; CHECK-NEXT: and x13, x0, #0xffffffff00000000
+; CHECK-NEXT: lsr w12, w11, #8
+; CHECK-NEXT: cmp w11, #0
+; CHECK-NEXT: csel w8, w8, w12, eq
+; CHECK-NEXT: ldr x12, [x10, #8]
+; CHECK-NEXT: csel x10, xzr, x9, eq
+; CHECK-NEXT: bfi w11, w8, #8, #24
+; CHECK-NEXT: orr x13, x10, x13
+; CHECK-NEXT: ldr x10, [x12, #16]
+; CHECK-NEXT: orr x0, x13, x11
+; CHECK-NEXT: cbnz x12, .LBB1_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
entry:
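; In both loops the old per-iteration cset + "orr ..., lsl #16" pair is gone:
; 0x10000 is materialized once before the loop and a csel picks it or xzr on
; each iteration. A minimal sketch of the select being lowered (assumed from
; the checks above, not a separate test in this patch):
define i64 @csel_imm_sketch(i32 %v) {
  %ne = icmp ne i32 %v, 0
  ; -> mov w8, #65536 (hoistable out of a loop); csel x0, xzr, x8, eq
  %r = select i1 %ne, i64 65536, i64 0
  ret i64 %r
}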
diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll b/llvm/test/CodeGen/AArch64/build-attributes-all.ll
index aecc74b..e2e1e0a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-all.ll
+++ b/llvm/test/CodeGen/AArch64/build-attributes-all.ll
@@ -2,9 +2,9 @@
; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1
-; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1
-; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1
+; ASM-NEXT: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+; ASM-NEXT: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+; ASM-NEXT: .aeabi_attribute 2, 1 // Tag_Feature_GCS
; ELF: Hex dump of section '.ARM.attributes':
; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll b/llvm/test/CodeGen/AArch64/build-attributes-bti.ll
index 8ec78df..92ce2ab 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-bti.ll
+++ b/llvm/test/CodeGen/AArch64/build-attributes-bti.ll
@@ -2,9 +2,9 @@
; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 1
-; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0
-; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0
+; ASM-NEXT: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+; ASM-NEXT: .aeabi_attribute 1, 0 // Tag_Feature_PAC
+; ASM-NEXT: .aeabi_attribute 2, 0 // Tag_Feature_GCS
; ELF: Hex dump of section '.ARM.attributes':
; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll b/llvm/test/CodeGen/AArch64/build-attributes-gcs.ll
index be52877..faa77d6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-gcs.ll
+++ b/llvm/test/CodeGen/AArch64/build-attributes-gcs.ll
@@ -2,9 +2,9 @@
; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0
-; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 0
-; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 1
+; ASM-NEXT: .aeabi_attribute 0, 0 // Tag_Feature_BTI
+; ASM-NEXT: .aeabi_attribute 1, 0 // Tag_Feature_PAC
+; ASM-NEXT: .aeabi_attribute 2, 1 // Tag_Feature_GCS
; ELF: Hex dump of section '.ARM.attributes':
; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll b/llvm/test/CodeGen/AArch64/build-attributes-pac.ll
index e3e5933..0358927 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pac.ll
+++ b/llvm/test/CodeGen/AArch64/build-attributes-pac.ll
@@ -2,9 +2,9 @@
; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
; ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-; ASM-NEXT: .aeabi_attribute Tag_Feature_BTI, 0
-; ASM-NEXT: .aeabi_attribute Tag_Feature_PAC, 1
-; ASM-NEXT: .aeabi_attribute Tag_Feature_GCS, 0
+; ASM-NEXT: .aeabi_attribute 0, 0 // Tag_Feature_BTI
+; ASM-NEXT: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+; ASM-NEXT: .aeabi_attribute 2, 0 // Tag_Feature_GCS
; ELF: Hex dump of section '.ARM.attributes':
; ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll b/llvm/test/CodeGen/AArch64/build-attributes-pauthabi.ll
index 35ad514c..1c2a725 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-build-attributes-pauthabi.ll
+++ b/llvm/test/CodeGen/AArch64/build-attributes-pauthabi.ll
@@ -2,8 +2,8 @@
; RUN: llc %s -filetype=obj -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
; ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
-; ASM-NEXT: .aeabi_attribute Tag_PAuth_Platform, 2
-; ASM-NEXT: .aeabi_attribute Tag_PAuth_Schema, 31
+; ASM-NEXT: .aeabi_attribute 1, 2 // Tag_PAuth_Platform
+; ASM-NEXT: .aeabi_attribute 2, 31 // Tag_PAuth_Schema
; ELF: Hex dump of section '.ARM.attributes':
; ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth
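; Across all five build-attributes tests the assembler now emits the numeric
; tag with the symbolic name as a trailing comment (grounded in the checks:
; Tag_Feature_BTI=0, Tag_Feature_PAC=1, Tag_Feature_GCS=2 in
; aeabi_feature_and_bits; Tag_PAuth_Platform=1, Tag_PAuth_Schema=2 in
; aeabi_pauthabi). A sketch of feeding the directives back in via module asm,
; assuming the integrated assembler accepts them verbatim:
module asm ".aeabi_subsection aeabi_pauthabi, required, uleb128"
module asm ".aeabi_attribute 1, 2 // Tag_PAuth_Platform"
module asm ".aeabi_attribute 2, 31 // Tag_PAuth_Schema"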
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 40daf8f..2496754 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: udot:
@@ -27,6 +27,66 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
ret <4 x i32> %partial.reduce
}
+define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT: mov x8, xzr
+; CHECK-DOT-NEXT: .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT: ldr q2, [x0, x8]
+; CHECK-DOT-NEXT: ldr q3, [x1, x8]
+; CHECK-DOT-NEXT: mov v0.16b, v1.16b
+; CHECK-DOT-NEXT: add x8, x8, #16
+; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT: cmp x8, #16
+; CHECK-DOT-NEXT: b.ne .LBB1_1
+; CHECK-DOT-NEXT: // %bb.2: // %end
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-NODOT-LABEL: udot_in_loop:
+; CHECK-NODOT: // %bb.0: // %entry
+; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT: mov x8, xzr
+; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
+; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
+; CHECK-NODOT-NEXT: add x8, x8, #16
+; CHECK-NODOT-NEXT: cmp x8, #16
+; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
+; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
+; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NODOT-NEXT: b.ne .LBB1_1
+; CHECK-NODOT-NEXT: // %bb.2: // %end
+; CHECK-NODOT-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
+ %gep1 = getelementptr i8, ptr %p1, i64 %index
+ %load1 = load <16 x i8>, ptr %gep1, align 16
+ %load1.wide = zext <16 x i8> %load1 to <16 x i32>
+ %gep2 = getelementptr i8, ptr %p2, i64 %index
+ %load2 = load <16 x i8>, ptr %gep2, align 16
+ %load2.wide = zext <16 x i8> %load2 to <16 x i32>
+ %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
+ %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
+ %index.next = add nuw i64 %index, 16
+ %cmp = icmp eq i64 %index.next, 16
+ br i1 %cmp, label %end, label %vector.body
+
+end:
+ ret <4 x i32> %acc
+}
+
define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: udot_narrow:
; CHECK-DOT: // %bb.0:
@@ -129,6 +189,68 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
ret <4 x i32> %partial.reduce
}
+define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
+; CHECK-NOI8MM-LABEL: usdot_in_loop:
+; CHECK-NOI8MM: // %bb.0: // %entry
+; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NOI8MM-NEXT: mov x8, xzr
+; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body
+; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
+; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT: add x8, x8, #16
+; CHECK-NOI8MM-NEXT: cmp x8, #16
+; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0
+; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
+; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
+; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
+; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT: b.ne .LBB6_1
+; CHECK-NOI8MM-NEXT: // %bb.2: // %end
+; CHECK-NOI8MM-NEXT: ret
+;
+; CHECK-I8MM-LABEL: usdot_in_loop:
+; CHECK-I8MM: // %bb.0: // %entry
+; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
+; CHECK-I8MM-NEXT: mov x8, xzr
+; CHECK-I8MM-NEXT: .LBB6_1: // %vector.body
+; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
+; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
+; CHECK-I8MM-NEXT: add x8, x8, #16
+; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b
+; CHECK-I8MM-NEXT: cmp x8, #16
+; CHECK-I8MM-NEXT: b.ne .LBB6_1
+; CHECK-I8MM-NEXT: // %bb.2: // %end
+; CHECK-I8MM-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
+ %gep1 = getelementptr i8, ptr %p1, i64 %index
+ %load1 = load <16 x i8>, ptr %gep1, align 16
+ %load1.wide = sext <16 x i8> %load1 to <16 x i32>
+ %gep2 = getelementptr i8, ptr %p2, i64 %index
+ %load2 = load <16 x i8>, ptr %gep2, align 16
+ %load2.wide = zext <16 x i8> %load2 to <16 x i32>
+ %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
+ %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
+ %index.next = add nuw i64 %index, 16
+ %cmp = icmp eq i64 %index.next, 16
+ br i1 %cmp, label %end, label %vector.body
+
+end:
+ ret <4 x i32> %acc
+}
+
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: usdot_narrow:
; CHECK-NOI8MM: // %bb.0:
@@ -176,13 +298,75 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
; CHECK-I8MM: // %bb.0:
; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
; CHECK-I8MM-NEXT: ret
- %u.wide = sext <16 x i8> %u to <16 x i32>
- %s.wide = zext <16 x i8> %s to <16 x i32>
- %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+ %s.wide = sext <16 x i8> %u to <16 x i32>
+ %u.wide = zext <16 x i8> %s to <16 x i32>
+ %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
ret <4 x i32> %partial.reduce
}
+define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
+; CHECK-NOI8MM-LABEL: sudot_in_loop:
+; CHECK-NOI8MM: // %bb.0: // %entry
+; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NOI8MM-NEXT: mov x8, xzr
+; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body
+; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
+; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT: add x8, x8, #16
+; CHECK-NOI8MM-NEXT: cmp x8, #16
+; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0
+; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
+; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
+; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
+; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT: b.ne .LBB9_1
+; CHECK-NOI8MM-NEXT: // %bb.2: // %end
+; CHECK-NOI8MM-NEXT: ret
+;
+; CHECK-I8MM-LABEL: sudot_in_loop:
+; CHECK-I8MM: // %bb.0: // %entry
+; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
+; CHECK-I8MM-NEXT: mov x8, xzr
+; CHECK-I8MM-NEXT: .LBB9_1: // %vector.body
+; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
+; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
+; CHECK-I8MM-NEXT: add x8, x8, #16
+; CHECK-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b
+; CHECK-I8MM-NEXT: cmp x8, #16
+; CHECK-I8MM-NEXT: b.ne .LBB9_1
+; CHECK-I8MM-NEXT: // %bb.2: // %end
+; CHECK-I8MM-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
+ %gep1 = getelementptr i8, ptr %p1, i64 %index
+ %load1 = load <16 x i8>, ptr %gep1, align 16
+ %load1.wide = zext <16 x i8> %load1 to <16 x i32>
+ %gep2 = getelementptr i8, ptr %p2, i64 %index
+ %load2 = load <16 x i8>, ptr %gep2, align 16
+ %load2.wide = sext <16 x i8> %load2 to <16 x i32>
+ %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
+ %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
+ %index.next = add nuw i64 %index, 16
+ %cmp = icmp eq i64 %index.next, 16
+ br i1 %cmp, label %end, label %vector.body
+
+end:
+ ret <4 x i32> %acc
+}
+
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: sudot_narrow:
; CHECK-NOI8MM: // %bb.0:
@@ -390,6 +574,62 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
ret <4 x i32> %partial.reduce
}
+define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
+; CHECK-DOT-LABEL: udot_no_bin_op_in_loop:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT: movi v2.16b, #1
+; CHECK-DOT-NEXT: mov x8, xzr
+; CHECK-DOT-NEXT: .LBB16_1: // %vector.body
+; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT: ldr q3, [x0, x8]
+; CHECK-DOT-NEXT: mov v0.16b, v1.16b
+; CHECK-DOT-NEXT: add x8, x8, #16
+; CHECK-DOT-NEXT: cmp x8, #16
+; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-NEXT: b.ne .LBB16_1
+; CHECK-DOT-NEXT: // %bb.2: // %end
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
+; CHECK-NODOT: // %bb.0: // %entry
+; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT: mov x8, xzr
+; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body
+; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
+; CHECK-NODOT-NEXT: add x8, x8, #16
+; CHECK-NODOT-NEXT: cmp x8, #16
+; CHECK-NODOT-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
+; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT: ushll v1.4s, v3.4h, #0
+; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h
+; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v3.8h
+; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NODOT-NEXT: b.ne .LBB16_1
+; CHECK-NODOT-NEXT: // %bb.2: // %end
+; CHECK-NODOT-NEXT: ret
+
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
+ %gep = getelementptr i8, ptr %p, i64 %index
+ %load = load <16 x i8>, ptr %gep, align 16
+ %load.wide = zext <16 x i8> %load to <16 x i32>
+ %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %load.wide)
+ %index.next = add nuw i64 %index, 16
+ %cmp = icmp eq i64 %index.next, 16
+ br i1 %cmp, label %end, label %vector.body
+
+end:
+ ret <4 x i32> %acc
+}
+
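; These new _in_loop variants check that routing the accumulator through a phi
; does not block dot-product selection. The intrinsic sums groups of lanes of
; the wide operand into the narrower accumulator (the exact lane grouping is
; left to the target); without a binary op, the DOT form is still usable by
; feeding an all-ones multiplicand (the "movi v2.16b, #1" in the checks).
; A minimal standalone use, mirroring the tests above (a sketch, not new API):
define <4 x i32> @partial_reduce_sketch(<4 x i32> %acc, <16 x i8> %a) {
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %r = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %r
}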
define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
; CHECK-DOT-LABEL: sdot_no_bin_op:
; CHECK-DOT: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/select_cc.ll b/llvm/test/CodeGen/AArch64/select_cc.ll
index 6a00878..6feaabe 100644
--- a/llvm/test/CodeGen/AArch64/select_cc.ll
+++ b/llvm/test/CodeGen/AArch64/select_cc.ll
@@ -6,8 +6,8 @@ define i64 @select_ogt_float(float %a, float %b) {
; CHECK-SD-LABEL: select_ogt_float:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp s0, s1
-; CHECK-SD-NEXT: cset w8, gt
-; CHECK-SD-NEXT: ubfiz x0, x8, #2, #32
+; CHECK-SD-NEXT: mov w8, #4 // =0x4
+; CHECK-SD-NEXT: csel x0, x8, xzr, gt
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: select_ogt_float:
@@ -26,8 +26,8 @@ define i64 @select_ule_float_inverse(float %a, float %b) {
; CHECK-SD-LABEL: select_ule_float_inverse:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcmp s0, s1
-; CHECK-SD-NEXT: cset w8, gt
-; CHECK-SD-NEXT: ubfiz x0, x8, #2, #32
+; CHECK-SD-NEXT: mov w8, #4 // =0x4
+; CHECK-SD-NEXT: csel x0, xzr, x8, le
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: select_ule_float_inverse:
@@ -45,9 +45,9 @@ entry:
define i64 @select_eq_i32(i32 %a, i32 %b) {
; CHECK-SD-LABEL: select_eq_i32:
; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #4 // =0x4
; CHECK-SD-NEXT: cmp w0, w1
-; CHECK-SD-NEXT: cset w8, eq
-; CHECK-SD-NEXT: ubfiz x0, x8, #2, #32
+; CHECK-SD-NEXT: csel x0, x8, xzr, eq
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: select_eq_i32:
@@ -65,9 +65,9 @@ entry:
define i64 @select_ne_i32_inverse(i32 %a, i32 %b) {
; CHECK-SD-LABEL: select_ne_i32_inverse:
; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #4 // =0x4
; CHECK-SD-NEXT: cmp w0, w1
-; CHECK-SD-NEXT: cset w8, eq
-; CHECK-SD-NEXT: ubfiz x0, x8, #2, #32
+; CHECK-SD-NEXT: csel x0, xzr, x8, ne
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: select_ne_i32_inverse:
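; Same lowering change as in the loop tests, here in straight-line code: a
; power-of-two select constant is no longer built as cset + ubfiz (a 0/1 bit
; shifted into place) but materialized directly and chosen with csel. Sketch
; of the IR under test, taken from the select_eq_i32 case above:
define i64 @select_const4_sketch(i32 %a, i32 %b) {
  ; old: cset w8, eq ; ubfiz x0, x8, #2, #32    new: mov w8, #4 ; csel x0, x8, xzr, eq
  %c = icmp eq i32 %a, %b
  %r = select i1 %c, i64 4, i64 0
  ret i64 %r
}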
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index f10327e1..a44c746 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -13,24 +13,24 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: .p2align 5, , 16
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr s4, [x1], #4
-; CHECK-NEXT: ldr w9, [x0], #4
-; CHECK-NEXT: add w9, w9, #10
-; CHECK-NEXT: scvtf d3, w9
+; CHECK-NEXT: ldr w10, [x0], #4
+; CHECK-NEXT: add w10, w10, #10
+; CHECK-NEXT: scvtf d3, w10
; CHECK-NEXT: fmadd s4, s4, s0, s1
; CHECK-NEXT: fabs s4, s4
; CHECK-NEXT: fcvt d4, s4
; CHECK-NEXT: fdiv d3, d3, d4
; CHECK-NEXT: fcmp d3, d2
-; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: csel x10, x9, xzr, lt
; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: ubfiz x9, x9, #4, #32
-; CHECK-NEXT: ldr s3, [x4, x9]
-; CHECK-NEXT: fcvtzs w9, s3
-; CHECK-NEXT: str w9, [x2], #4
+; CHECK-NEXT: ldr s3, [x4, x10]
+; CHECK-NEXT: fcvtzs w10, s3
+; CHECK-NEXT: str w10, [x2], #4
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
; CHECK-NEXT: mov w0, wzr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 3d7fec9..2389924 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_uaddsat_i7:
@@ -35,14 +36,32 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 9, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
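; The GFX11 run line is split because +real-true16 lets 16-bit ALU ops address
; the low and high halves of 32-bit VGPRs directly (the v0.l / v0.h operands
; in the TRUE16 checks), while -real-true16 keeps whole-register 16-bit ops.
; The IR under test is just the saturating-add intrinsic; e.g. for i8,
; mirroring the tests below:
define i8 @uaddsat_sketch(i8 %a, i8 %b) {
  %r = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %b)  ; clamps at 255 instead of wrapping
  ret i8 %r
}
declare i8 @llvm.uadd.sat.i8(i8, i8)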
@@ -78,14 +97,32 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 9
+; GFX10-NEXT: s_lshl_b32 s1, s1, 9
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -120,14 +157,32 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -163,14 +218,32 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 8
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -247,25 +320,40 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uaddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uaddsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -358,29 +446,50 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_uaddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_uaddsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -524,36 +633,69 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uaddsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, v3 clamp
-; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uaddsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v2, v1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, v3 clamp
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -729,46 +871,89 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_uaddsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
-; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
-; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
-; GFX11-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_add_u16 v0, s2, s3 clamp
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_u16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_uaddsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -1761,11 +1946,23 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_uaddsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uaddsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l clamp
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uaddsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1795,11 +1992,23 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_uaddsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_uaddsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_uaddsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_uaddsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1825,10 +2034,20 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: uaddsat_i16_sv:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, v0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: uaddsat_i16_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, s0, v0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: uaddsat_i16_sv:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, v0.l clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: uaddsat_i16_sv:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, v0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
@@ -1855,10 +2074,20 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: uaddsat_i16_vs:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_nc_u16 v0, v0, s0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: uaddsat_i16_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_nc_u16 v0, v0, s0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: uaddsat_i16_vs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, s0 clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: uaddsat_i16_vs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, s0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 5a8b5fc..34d3658 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_usubsat_i7:
@@ -34,14 +35,32 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 9, v1.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 9, v1
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -76,14 +95,32 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i7:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 9
+; GFX10-NEXT: s_lshl_b32 s1, s1, 9
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i7:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i7:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
}
@@ -117,14 +154,32 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -159,14 +214,32 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i8:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
-; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b32 s0, s0, 8
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
}
@@ -241,25 +314,40 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_usubsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_usubsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -350,29 +438,50 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_usubsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_usubsat_v2i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_v2i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -512,36 +621,69 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_usubsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
-; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_usubsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, v2, v1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v7, v1, 16
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v1, v3, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -713,46 +855,89 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_usubsat_v4i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2
-; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4
-; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5
-; GFX11-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
-; GFX11-NEXT: s_lshl_b32 s5, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s4, s4, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_usubsat_v4i8:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_v4i8:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s0, s2
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s0, s0, s3
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s1, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-FAKE16-NEXT: s_pack_hl_b32_b16 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
+; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
%result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
@@ -1678,11 +1863,23 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_usubsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l clamp
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_usubsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1711,11 +1908,23 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: s_usubsat_i16:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: s_usubsat_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_usubsat_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_usubsat_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
}
@@ -1740,10 +1949,20 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: usubsat_i16_sv:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: usubsat_i16_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: usubsat_i16_sv:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, v0.l clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: usubsat_i16_sv:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
@@ -1769,10 +1988,20 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10PLUS-LABEL: usubsat_i16_vs:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
-; GFX10PLUS-NEXT: ; return to shader part epilog
+; GFX10-LABEL: usubsat_i16_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: usubsat_i16_vs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, s0 clamp
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: usubsat_i16_vs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
ret half %cast
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index a3f27eb..c9a4379 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -19,8 +19,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0
; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1
-; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc
+; GCN-PROMOTE-NEXT: s_cselect_b32 [[SCC:s[0-9]+]], 1, 0
+; GCN-PROMOTE-NEXT: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[SCC]], v0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @work_item_info(ptr addrspace(1) %out, i32 %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index ab23638..54b4888 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -82,17 +82,16 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_log_f32_e32 v3, v3
-; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
+; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
+; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
@@ -228,8 +227,7 @@ define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v0, v0, v3
; CHECK-NEXT: v_log_f32_e32 v0, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
@@ -368,8 +366,7 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_log_f32_e32 v3, v3
; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
@@ -511,8 +508,7 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v0, |v0|, v3
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT: v_log_f32_e32 v0, v0
@@ -651,8 +647,7 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
; CHECK-NEXT: v_log_f32_e32 v3, v3
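
The hunks in this file all record the same peephole: a v_cndmask producing 0 or 1 followed by a left shift by 5 is folded into a single v_cndmask that selects 0 or 32 directly, dropping one VALU instruction ahead of each v_ldexp_f32. The same fold accounts for the matching check updates in the bf16.ll, copysign-simplify-demanded-bits.ll, and fdiv_flags.f32.ll hunks further down. A hedged scalar IR sketch of the rewrite (names illustrative):

define i32 @select_shl(i1 %c) {
  ; illustrative only: (zext %c) << 5 is equivalent to select %c, 32, 0,
  ; which maps to one v_cndmask_b32 with an inline constant
  %ext = zext i1 %c to i32
  %shl = shl i32 %ext, 5
  ret i32 %shl
}
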
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index ad646a3..e2510bb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -53,51 +53,6 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}global_size_x:
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x3
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0xc
-; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NOHSA: buffer_store_dword [[VVAL]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define amdgpu_kernel void @global_size_x (ptr addrspace(1) %out) {
-entry:
- %0 = call i32 @llvm.r600.read.global.size.x() #0
- store i32 %0, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_size_y:
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x4
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x10
-; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NOHSA: buffer_store_dword [[VVAL]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define amdgpu_kernel void @global_size_y (ptr addrspace(1) %out) {
-entry:
- %0 = call i32 @llvm.r600.read.global.size.y() #0
- store i32 %0, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_size_z:
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x5
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x14
-; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NOHSA: buffer_store_dword [[VVAL]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define amdgpu_kernel void @global_size_z (ptr addrspace(1) %out) {
-entry:
- %0 = call i32 @llvm.r600.read.global.size.z() #0
- store i32 %0, ptr addrspace(1) %out
- ret void
-}
-
; FUNC-LABEL: {{^}}local_size_x:
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x6
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x18
@@ -147,10 +102,6 @@ declare i32 @llvm.r600.read.ngroups.x() #0
declare i32 @llvm.r600.read.ngroups.y() #0
declare i32 @llvm.r600.read.ngroups.z() #0
-declare i32 @llvm.r600.read.global.size.x() #0
-declare i32 @llvm.r600.read.global.size.y() #0
-declare i32 @llvm.r600.read.global.size.z() #0
-
declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index b657223..aafdb1c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -27748,8 +27748,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
@@ -27775,8 +27774,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
@@ -27800,8 +27798,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
@@ -27834,8 +27831,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
@@ -27864,8 +27860,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
@@ -27890,30 +27885,28 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log.bf16(bfloat %a)
@@ -27929,8 +27922,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -27945,8 +27937,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -27961,8 +27952,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -27983,8 +27973,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -28004,9 +27993,8 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -28024,21 +28012,20 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log2.bf16(bfloat %a)
@@ -28055,8 +28042,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
@@ -28082,8 +28068,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -28107,8 +28092,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
@@ -28141,8 +28125,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -28171,8 +28154,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -28197,30 +28179,28 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log10.bf16(bfloat %a)
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index fdc9704..a01c2fa 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -337,8 +337,7 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; GFX9-NEXT: v_ldexp_f32 v3, |v0|, v3
; GFX9-NEXT: v_log_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 935ae48..993f162 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1278,84 +1278,44 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2)
}
define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f64_test12:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2
-; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f64_test12:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_f64_test12:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_f64_test12:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_f64_test12:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v3
-; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f64_test12:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, 0
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: fmul_select_f64_test12:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_f64_test12:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: fmul_select_f64_test12:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
-; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test12:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: fmul_select_f64_test12:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_f64_test12:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 0.000000e+00, double -0.000000e+00
%ldexp = fmul double %x, %y
@@ -3137,11 +3097,11 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -3149,10 +3109,10 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-LABEL: fmul_select_bf16_test8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff8000
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -3169,8 +3129,7 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b16 v1, 15, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -3186,19 +3145,17 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 15, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
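
The fmul_select_f64_test12 hunks above also converge the SelectionDAG and GlobalISel paths onto one sequence: rather than shifting a 0/1 select left by 31 to build the -0.0 multiplier, the sign-bit word 0x80000000 is selected directly (materialized with v_bfrev_b32 or as an inline literal). In scalar terms, the high word of the selected f64 is just the following (sketch; function name illustrative):

define i32 @sign_word(i1 %b) {
  ; illustrative only: the high 32 bits of select %b, 0.0, -0.0 as an f64
  %hi = select i1 %b, i32 0, i32 -2147483648   ; 0x80000000
  ret i32 %hi
}
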
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 67f2487..22bf6cd 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -29,20 +29,18 @@ entry:
define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: int4_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s6, 1
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s6, 2
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s6, 3
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v0, vcc
+; GCN-NEXT: s_cmp_eq_u32 s2, 1
+; GCN-NEXT: s_cselect_b32 s3, 1, 0
+; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cselect_b32 s3, s3, 2
+; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cselect_b32 s2, s3, 4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
entry:
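
In int4_extelt the compare/select chain now stays entirely in scalar (SALU) form as s_cselect_b32, and the result crosses into a VGPR only once, for the store, instead of round-tripping each step through vcc and v_cndmask. A dynamic extract from a small constant vector lowers to exactly this kind of chain; a hedged sketch (the element values are inferred from the checks, not copied from the test source):

define i32 @dyn_extract(i32 %sel) {
  ; illustrative only: lowered as a chain of s_cmp + s_cselect_b32
  %elt = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
  ret i32 %elt
}
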
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index a94f27a..e1ecd34 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -98,7 +98,9 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index a457338..fec04a2 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1529,42 +1529,35 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; SI-NEXT: v_med3_i32 v1, s6, 0, 13
; SI-NEXT: s_or_b32 s4, s2, 0x1000
; SI-NEXT: v_readfirstlane_b32 s6, v1
-; SI-NEXT: s_lshr_b32 s6, s4, s6
-; SI-NEXT: v_lshl_b32_e32 v1, s6, v1
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10
-; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: s_lshl_b32 s5, s8, 12
-; SI-NEXT: s_or_b32 s4, s6, s4
-; SI-NEXT: s_or_b32 s5, s2, s5
-; SI-NEXT: s_cmp_lt_i32 s8, 1
-; SI-NEXT: s_cselect_b32 s9, s4, s5
-; SI-NEXT: s_and_b32 s6, s9, 7
+; SI-NEXT: s_lshr_b32 s7, s4, s6
+; SI-NEXT: s_lshl_b32 s6, s7, s6
+; SI-NEXT: s_cmp_lg_u32 s6, s4
+; SI-NEXT: s_cselect_b32 s4, 1, 0
+; SI-NEXT: s_addk_i32 s5, 0xfc10
+; SI-NEXT: s_lshl_b32 s6, s5, 12
+; SI-NEXT: s_or_b32 s4, s7, s4
+; SI-NEXT: s_or_b32 s6, s2, s6
+; SI-NEXT: s_cmp_lt_i32 s5, 1
+; SI-NEXT: s_cselect_b32 s4, s4, s6
+; SI-NEXT: s_and_b32 s6, s4, 7
; SI-NEXT: s_cmp_gt_i32 s6, 5
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_cselect_b32 s7, 1, 0
; SI-NEXT: s_cmp_eq_u32 s6, 3
-; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SI-NEXT: s_lshr_b32 s6, s9, 2
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: s_addc_u32 s4, s6, 0
-; SI-NEXT: s_cmp_lt_i32 s8, 31
-; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00
+; SI-NEXT: s_cselect_b32 s6, 1, 0
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_lshr_b32 s4, s4, 2
+; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_cmp_lt_i32 s5, 31
+; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s2, 0
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; SI-NEXT: v_lshlrev_b32_e32 v1, 9, v1
-; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f
-; SI-NEXT: v_or_b32_e32 v1, 0x7c00, v1
-; SI-NEXT: v_mov_b32_e32 v2, s6
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_lshr_b32 s2, s3, 16
-; SI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; SI-NEXT: s_and_b32 s2, s2, 0x8000
-; SI-NEXT: v_or_b32_e32 v1, s2, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: s_movk_i32 s2, 0x7e00
+; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; SI-NEXT: s_cselect_b32 s2, s2, s4
+; SI-NEXT: s_lshr_b32 s3, s3, 16
+; SI-NEXT: s_and_b32 s3, s3, 0x8000
+; SI-NEXT: s_or_b32 s2, s3, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
@@ -1587,47 +1580,42 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: s_bfe_u32 s1, s3, 0xb0014
-; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: s_sub_i32 s2, 0x3f1, s1
-; VI-NEXT: s_or_b32 s5, s5, s0
-; VI-NEXT: v_med3_i32 v2, s2, 0, 13
-; VI-NEXT: s_or_b32 s0, s5, 0x1000
-; VI-NEXT: v_readfirstlane_b32 s2, v2
-; VI-NEXT: s_lshr_b32 s2, s0, s2
-; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10
+; VI-NEXT: s_bfe_u32 s2, s3, 0xb0014
; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: s_lshl_b32 s1, s6, 12
-; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_sub_i32 s3, 0x3f1, s2
+; VI-NEXT: s_or_b32 s0, s5, s0
+; VI-NEXT: v_med3_i32 v2, s3, 0, 13
+; VI-NEXT: s_or_b32 s1, s0, 0x1000
+; VI-NEXT: v_readfirstlane_b32 s3, v2
+; VI-NEXT: s_lshr_b32 s5, s1, s3
+; VI-NEXT: s_lshl_b32 s3, s5, s3
+; VI-NEXT: s_cmp_lg_u32 s3, s1
+; VI-NEXT: s_cselect_b32 s1, 1, 0
+; VI-NEXT: s_addk_i32 s2, 0xfc10
+; VI-NEXT: s_lshl_b32 s3, s2, 12
; VI-NEXT: s_or_b32 s1, s5, s1
-; VI-NEXT: s_cmp_lt_i32 s6, 1
-; VI-NEXT: s_cselect_b32 s7, s0, s1
-; VI-NEXT: s_and_b32 s2, s7, 7
-; VI-NEXT: s_cmp_gt_i32 s2, 5
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_eq_u32 s2, 3
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT: s_lshr_b32 s2, s7, 2
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s2, 0
-; VI-NEXT: s_cmp_lt_i32 s6, 31
-; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00
-; VI-NEXT: s_cmp_lg_u32 s5, 0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2
-; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f
-; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; VI-NEXT: s_movk_i32 s0, 0x7fff
+; VI-NEXT: s_or_b32 s3, s0, s3
+; VI-NEXT: s_cmp_lt_i32 s2, 1
+; VI-NEXT: s_cselect_b32 s1, s1, s3
+; VI-NEXT: s_and_b32 s3, s1, 7
+; VI-NEXT: s_cmp_gt_i32 s3, 5
+; VI-NEXT: s_cselect_b32 s5, 1, 0
+; VI-NEXT: s_cmp_eq_u32 s3, 3
+; VI-NEXT: s_cselect_b32 s3, 1, 0
+; VI-NEXT: s_or_b32 s3, s3, s5
+; VI-NEXT: s_lshr_b32 s1, s1, 2
+; VI-NEXT: s_add_i32 s1, s1, s3
+; VI-NEXT: s_cmp_lt_i32 s2, 31
+; VI-NEXT: s_cselect_b32 s1, s1, 0x7c00
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_movk_i32 s0, 0x7e00
+; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; VI-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; VI-NEXT: s_cselect_b32 s0, s0, s1
+; VI-NEXT: s_movk_i32 s1, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
+; VI-NEXT: v_bfi_b32 v2, s1, v2, v3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1646,45 +1634,40 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT: s_bfe_u32 s3, s3, 0xb0014
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3
-; GFX9-NEXT: s_or_b32 s7, s7, s2
-; GFX9-NEXT: v_med3_i32 v1, s4, 0, 13
-; GFX9-NEXT: s_or_b32 s2, s7, 0x1000
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_lshr_b32 s4, s2, s4
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s2, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: s_add_i32 s8, s3, 0xfffffc10
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: s_lshl_b32 s3, s8, 12
-; GFX9-NEXT: s_or_b32 s2, s4, s2
-; GFX9-NEXT: s_or_b32 s3, s7, s3
-; GFX9-NEXT: s_cmp_lt_i32 s8, 1
-; GFX9-NEXT: s_cselect_b32 s9, s2, s3
-; GFX9-NEXT: s_and_b32 s4, s9, 7
-; GFX9-NEXT: s_cmp_gt_i32 s4, 5
-; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 3
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT: s_lshr_b32 s4, s9, 2
-; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9-NEXT: s_addc_u32 s2, s4, 0
-; GFX9-NEXT: s_cmp_lt_i32 s8, 31
-; GFX9-NEXT: s_cselect_b32 s4, s2, 0x7c00
-; GFX9-NEXT: s_cmp_lg_u32 s7, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1
-; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x40f
-; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: s_sub_i32 s5, 0x3f1, s3
+; GFX9-NEXT: s_or_b32 s2, s7, s2
+; GFX9-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX9-NEXT: s_or_b32 s4, s2, 0x1000
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_lshr_b32 s7, s4, s5
+; GFX9-NEXT: s_lshl_b32 s5, s7, s5
+; GFX9-NEXT: s_cmp_lg_u32 s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_addk_i32 s3, 0xfc10
+; GFX9-NEXT: s_lshl_b32 s5, s3, 12
+; GFX9-NEXT: s_or_b32 s4, s7, s4
+; GFX9-NEXT: s_or_b32 s5, s2, s5
+; GFX9-NEXT: s_cmp_lt_i32 s3, 1
+; GFX9-NEXT: s_cselect_b32 s4, s4, s5
+; GFX9-NEXT: s_and_b32 s5, s4, 7
+; GFX9-NEXT: s_cmp_gt_i32 s5, 5
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 3
+; GFX9-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-NEXT: s_or_b32 s5, s5, s7
+; GFX9-NEXT: s_lshr_b32 s4, s4, 2
+; GFX9-NEXT: s_add_i32 s4, s4, s5
+; GFX9-NEXT: s_cmp_lt_i32 s3, 31
+; GFX9-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_movk_i32 s2, 0x7e00
+; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX9-NEXT: s_cselect_b32 s2, s2, s4
+; GFX9-NEXT: s_movk_i32 s3, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT: v_bfi_b32 v1, s3, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -1704,51 +1687,48 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
-; GFX11-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
-; GFX11-NEXT: s_lshl_b32 s7, s2, 12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_or_b32 s3, s5, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
-; GFX11-NEXT: s_or_b32 s7, s3, s7
-; GFX11-NEXT: s_lshr_b32 s6, s5, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-NEXT: s_or_b32 s5, s7, s5
+; GFX11-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s6, s3, s6
+; GFX11-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-NEXT: s_cselect_b32 s5, s5, s6
; GFX11-NEXT: s_and_b32 s6, s5, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-NEXT: s_cselect_b32 s7, -1, 0
+; GFX11-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-NEXT: s_lshr_b32 s5, s5, 2
; GFX11-NEXT: s_or_b32 s6, s6, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
-; GFX11-NEXT: s_addc_u32 s5, s5, 0
+; GFX11-NEXT: s_add_i32 s5, s5, s6
; GFX11-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-NEXT: s_movk_i32 s6, 0x7e00
; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
-; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
+; GFX11-NEXT: s_cselect_b32 s2, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%mag.trunc = fptrunc double %mag to half
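
The s_copysign_out_f16_mag_f64_sign_f16 rewrites show two related cleanups in the software f64-to-f16 rounding sequence: the rounding increment becomes a plain s_add_i32 of 0/1 flags rather than an s_addc_u32 driven by a carry, and the quiet-NaN pattern 0x7e00 is selected against the infinity pattern 0x7c00 with s_cselect instead of being assembled by shifting a flag left by 9 and OR-ing it into 0x7c00. The fneg-combines.f16.ll hunks below carry the same change on the VALU side. A hedged sketch of the operation under test (shape mirrors the test name; function name illustrative):

declare half @llvm.copysign.f16(half, half)

define half @trunc_copysign(double %mag, half %sign) {
  ; illustrative only: fptrunc double-to-half is expanded in software on
  ; SI/VI, which is where the rounding/NaN selection above comes from
  %t = fptrunc double %mag to half
  %r = call half @llvm.copysign.f16(half %t, half %sign)
  ret half %r
}
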
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index a324ba3..3983655 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1499,8 +1499,7 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000
; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CODEGEN-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1535,8 +1534,7 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000
; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; IR-IEEE-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index a33fd03..eefcf56 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -501,14 +501,12 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index f72b01b8..57c6926 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7844,8 +7844,11 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v3, v2, s[6:7] glc dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
-; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, v0.h, v1.l
; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a..f4c5ebd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3000,18 +3000,19 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-NEXT: v_and_b32_e32 v4, 7, v2
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v5, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -3049,18 +3050,19 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-NEXT: v_and_b32_e32 v4, 7, v2
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v5, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT: s_movk_i32 s4, 0x40f
-; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, 0x8000
@@ -3085,8 +3087,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -3094,21 +3095,23 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
@@ -3149,18 +3152,19 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-NEXT: v_and_b32_e32 v4, 7, v2
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v5, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -3196,18 +3200,19 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-NEXT: v_and_b32_e32 v4, 7, v2
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v5, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT: s_movk_i32 s4, 0x40f
-; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, 0x8000
@@ -3229,9 +3234,8 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
@@ -3242,20 +3246,22 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@@ -3298,18 +3304,19 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; SI-NEXT: v_and_b32_e32 v5, 7, v2
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v5
+; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
+; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; SI-NEXT: v_mov_b32_e32 v5, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4
; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v6, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
@@ -3349,18 +3356,19 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; VI-NEXT: v_and_b32_e32 v6, 7, v4
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v6
+; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
+; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v6, v6, v7
; VI-NEXT: v_lshrrev_b32_e32 v4, 2, v4
-; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_mov_b32_e32 v6, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v7, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT: s_movk_i32 s4, 0x40f
-; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_mov_b32_e32 v4, 0x8000
@@ -3383,9 +3391,8 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
; GFX11-NEXT: v_med3_i32 v3, v5, 0, 13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6
@@ -3396,27 +3403,29 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 9, 0x7c00
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v5, 7, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg double %a
@@ -3456,18 +3465,19 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
; SI-NEXT: v_and_b32_e32 v7, 7, v5
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7
+; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
+; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
-; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; SI-NEXT: v_mov_b32_e32 v7, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v8, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v4, 9, v4
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_or_b32_e32 v4, 0x7c00, v4
+; SI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6
; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
@@ -3506,19 +3516,20 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
; VI-NEXT: v_and_b32_e32 v7, 7, v5
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7
-; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
-; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
+; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3]
+; VI-NEXT: v_or_b32_e32 v7, v7, v8
+; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; VI-NEXT: v_mov_b32_e32 v7, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v8, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v4, 9, v4
; VI-NEXT: s_movk_i32 s4, 0x40f
-; VI-NEXT: v_or_b32_e32 v4, 0x7c00, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, 0x8000
@@ -3537,42 +3548,43 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; GFX11-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3]
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v7, 0x3f1, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-NEXT: v_and_or_b32 v4, 0xffe, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v5, v7, 0, 13
; GFX11-NEXT: v_or_b32_e32 v7, 0x1000, v4
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v8, v5, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v5, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 9, 0x7c00
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v5, v8, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v5, 7, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
+; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4
+; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 7, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v7
+; GFX11-NEXT: v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3613,18 +3625,19 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-NEXT: v_and_b32_e32 v4, 7, v2
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v5, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -3662,18 +3675,19 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; VI-NEXT: v_and_b32_e32 v4, 7, v2
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
+; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v5, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; VI-NEXT: s_movk_i32 s4, 0x40f
-; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, 0x8000
@@ -3696,9 +3710,8 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
@@ -3709,20 +3722,22 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index b2d30b7..e1791da 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -828,9 +828,9 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT: v_bfrev_b32_e32 v2, 1
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -839,14 +839,14 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v5, 1, v0
-; GFX11-NEXT: v_cmp_lt_i32_e64 s0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_cndmask_b32 v1, v2, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 31, v5
-; GFX11-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v1, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%i = and i32 %arg, 1
%i3 = icmp eq i32 %i, 0
@@ -907,12 +907,13 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff8000
; GFX7-NEXT: v_cmp_lt_i32_e32 vcc, 1, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 31, v2
-; GFX7-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -922,9 +923,9 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
; GFX9-NEXT: v_and_b32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff8000
; GFX9-NEXT: v_cmp_lt_i16_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 15, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GFX9-NEXT: v_xor_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
@@ -934,16 +935,14 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11-NEXT: v_cmp_lt_i16_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-NEXT: v_lshlrev_b16 v0, 15, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_i16_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 0xffff8000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
%i = and i16 %arg, 1
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 3d3e8be..4bab6ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -97,59 +97,53 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_movk_i32 s2, 0x7e00
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_lshr_b32 s4, s7, 8
-; SI-NEXT: s_and_b32 s5, s7, 0x1ff
-; SI-NEXT: s_and_b32 s8, s4, 0xffe
-; SI-NEXT: s_or_b32 s4, s5, s6
-; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT: s_bfe_u32 s4, s7, 0xb0014
-; SI-NEXT: v_readfirstlane_b32 s5, v0
-; SI-NEXT: s_sub_i32 s6, 0x3f1, s4
-; SI-NEXT: s_add_i32 s10, s4, 0xfffffc10
-; SI-NEXT: s_or_b32 s11, s8, s5
+; SI-NEXT: s_lshr_b32 s0, s7, 8
+; SI-NEXT: s_and_b32 s1, s7, 0x1ff
+; SI-NEXT: s_and_b32 s8, s0, 0xffe
+; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; SI-NEXT: v_readfirstlane_b32 s1, v0
+; SI-NEXT: s_sub_i32 s6, 0x3f1, s0
+; SI-NEXT: s_or_b32 s1, s8, s1
; SI-NEXT: v_med3_i32 v0, s6, 0, 13
-; SI-NEXT: s_lshl_b32 s4, s10, 12
-; SI-NEXT: s_or_b32 s5, s11, 0x1000
-; SI-NEXT: v_readfirstlane_b32 s6, v0
-; SI-NEXT: s_or_b32 s4, s11, s4
-; SI-NEXT: s_lshr_b32 s6, s5, s6
-; SI-NEXT: v_lshl_b32_e32 v0, s6, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, s5, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_readfirstlane_b32 s5, v0
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: s_cmp_lt_i32 s10, 1
-; SI-NEXT: s_cselect_b32 s6, s5, s4
+; SI-NEXT: s_or_b32 s6, s1, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s8, v0
+; SI-NEXT: s_lshr_b32 s9, s6, s8
+; SI-NEXT: s_lshl_b32 s8, s9, s8
+; SI-NEXT: s_cmp_lg_u32 s8, s6
+; SI-NEXT: s_cselect_b32 s6, 1, 0
+; SI-NEXT: s_addk_i32 s0, 0xfc10
+; SI-NEXT: s_or_b32 s6, s9, s6
+; SI-NEXT: s_lshl_b32 s8, s0, 12
+; SI-NEXT: s_or_b32 s8, s1, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 1
+; SI-NEXT: s_cselect_b32 s6, s6, s8
; SI-NEXT: s_and_b32 s8, s6, 7
; SI-NEXT: s_cmp_gt_i32 s8, 5
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_cselect_b32 s9, 1, 0
; SI-NEXT: s_cmp_eq_u32 s8, 3
-; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT: s_cselect_b32 s8, 1, 0
; SI-NEXT: s_lshr_b32 s6, s6, 2
-; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: s_cmp_lg_u32 s4, 0
-; SI-NEXT: s_addc_u32 s4, s6, 0
-; SI-NEXT: s_cmp_lt_i32 s10, 31
-; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00
-; SI-NEXT: s_cmp_lg_u32 s11, 0
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
-; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: s_lshr_b32 s4, s7, 16
-; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-NEXT: s_and_b32 s4, s4, 0x8000
-; SI-NEXT: v_or_b32_e32 v0, s4, v0
+; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_add_i32 s6, s6, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 31
+; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT: s_cmp_lg_u32 s1, 0
+; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
+; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; SI-NEXT: s_cselect_b32 s0, s1, s6
+; SI-NEXT: s_lshr_b32 s1, s7, 16
+; SI-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-NEXT: s_or_b32 s6, s1, s0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -169,46 +163,41 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT: s_bfe_u32 s5, s7, 0xb0014
-; VI-SAFE-SDAG-NEXT: s_or_b32 s6, s8, s4
-; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s5
+; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6
; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13
-; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s6, 0x1000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s4, s8
-; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v0, s8
-; VI-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
-; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SAFE-SDAG-NEXT: s_add_i32 s10, s5, 0xfffffc10
-; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT: s_lshl_b32 s5, s10, 12
-; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
-; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 1
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s11, s4, s5
-; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s11, 7
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7
; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0
; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-SAFE-SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s8, s11, 2
-; VI-SAFE-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
-; VI-SAFE-SDAG-NEXT: s_addc_u32 s4, s8, 0
-; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s10, 31
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, s4, 0x7c00
-; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
-; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
-; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v1, s8
-; VI-SAFE-SDAG-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 16
-; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
-; VI-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s4, v0
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16
+; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4
; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SAFE-SDAG-NEXT: s_endpgm
;
@@ -299,45 +288,41 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
-; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
-; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
@@ -430,53 +415,50 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
-; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index c1d5b58..87c7cce 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1837,8 +1837,7 @@ define float @v_sqrt_f32_ulp2(float %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1874,8 +1873,7 @@ define float @v_sqrt_f32_ulp25(float %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1911,8 +1909,7 @@ define float @v_sqrt_f32_ulp3(float %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1947,8 +1944,7 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5]
@@ -2074,12 +2070,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2218,12 +2212,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s6, 0x800000
; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, s6
; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[6:7]
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2315,8 +2307,7 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -2404,8 +2395,7 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2489,8 +2479,7 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2574,8 +2563,7 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2631,12 +2619,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2763,12 +2749,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2900,12 +2884,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5]
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -3026,8 +3008,7 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3062,8 +3043,7 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3098,8 +3078,7 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3698,8 +3677,7 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3734,8 +3712,7 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3770,8 +3747,7 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3910,8 +3886,7 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SDAG-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1
; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 34ee90c..42f0985 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -239,10 +239,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s2
; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -308,10 +308,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s2
; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -377,10 +377,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s2
; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -446,10 +446,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s2
; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -677,11 +677,10 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
; SDAG-NEXT: s_brev_b32 s5, 8
; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -703,8 +702,8 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
@@ -874,11 +873,10 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
; SDAG-NEXT: s_brev_b32 s5, 8
; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -900,8 +898,8 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
@@ -1102,11 +1100,10 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
; SDAG-NEXT: s_brev_b32 s5, 8
; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1128,8 +1125,8 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
@@ -1201,16 +1198,14 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
; SDAG-NEXT: s_mov_b32 s6, 0
; SDAG-NEXT: s_brev_b32 s7, 8
; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT: v_mov_b32_e32 v10, 0x100
; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; SDAG-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7]
; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 0adae5e..56a3ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -885,10 +885,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_addk_i32 s2, 0xfe00
; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: s_cselect_b32 s4, 1, 0
; GENERIC-NEXT: s_cmp_lg_u32 s2, 2
-; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GENERIC-NEXT: v_readfirstlane_b32 s4, v0
; GENERIC-NEXT: s_cselect_b32 s4, s4, 2
; GENERIC-NEXT: s_cmp_lg_u32 s2, 3
; GENERIC-NEXT: s_cselect_b32 s4, s4, 3
@@ -3319,57 +3317,56 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
; GENERIC-NEXT: s_mov_b32 s2, -1
; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
; GENERIC-NEXT: s_addk_i32 s6, 0xfe00
-; GENERIC-NEXT: s_cmp_eq_u32 s6, 0
-; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
; GENERIC-NEXT: s_cmp_eq_u32 s6, 3
-; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GENERIC-NEXT: s_cselect_b32 s4, 16, 3
; GENERIC-NEXT: s_cmp_eq_u32 s6, 2
-; GENERIC-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GENERIC-NEXT: s_cselect_b32 s5, 16, 2
; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
; GENERIC-NEXT: v_mov_b32_e32 v3, s4
; GENERIC-NEXT: s_cselect_b32 s4, 16, 1
-; GENERIC-NEXT: s_cmp_eq_u32 s6, 7
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 0
; GENERIC-NEXT: v_mov_b32_e32 v2, s5
-; GENERIC-NEXT: s_cselect_b32 s5, 16, 7
-; GENERIC-NEXT: s_cmp_eq_u32 s6, 6
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 7
; GENERIC-NEXT: v_mov_b32_e32 v1, s4
-; GENERIC-NEXT: s_cselect_b32 s4, 16, 6
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 7
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 6
+; GENERIC-NEXT: v_mov_b32_e32 v0, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 6
; GENERIC-NEXT: s_cmp_eq_u32 s6, 5
-; GENERIC-NEXT: v_mov_b32_e32 v7, s5
-; GENERIC-NEXT: s_cselect_b32 s5, 16, 5
+; GENERIC-NEXT: v_mov_b32_e32 v7, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 5
; GENERIC-NEXT: s_cmp_eq_u32 s6, 4
-; GENERIC-NEXT: v_mov_b32_e32 v6, s4
-; GENERIC-NEXT: s_cselect_b32 s4, 16, 4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 4
; GENERIC-NEXT: s_cmp_eq_u32 s6, 11
-; GENERIC-NEXT: v_mov_b32_e32 v5, s5
-; GENERIC-NEXT: s_cselect_b32 s5, 16, 11
+; GENERIC-NEXT: v_mov_b32_e32 v5, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 11
; GENERIC-NEXT: s_cmp_eq_u32 s6, 10
-; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT: s_cselect_b32 s4, 16, 10
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 10
; GENERIC-NEXT: s_cmp_eq_u32 s6, 9
; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_mov_b32_e32 v7, s5
-; GENERIC-NEXT: s_cselect_b32 s5, 16, 9
+; GENERIC-NEXT: v_mov_b32_e32 v7, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 9
; GENERIC-NEXT: s_cmp_eq_u32 s6, 8
-; GENERIC-NEXT: v_mov_b32_e32 v6, s4
-; GENERIC-NEXT: s_cselect_b32 s4, 16, 8
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 8
; GENERIC-NEXT: s_cmp_eq_u32 s6, 15
-; GENERIC-NEXT: v_mov_b32_e32 v5, s5
-; GENERIC-NEXT: s_cselect_b32 s5, 16, 15
+; GENERIC-NEXT: v_mov_b32_e32 v5, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 15
; GENERIC-NEXT: s_cmp_eq_u32 s6, 14
-; GENERIC-NEXT: v_mov_b32_e32 v4, s4
-; GENERIC-NEXT: s_cselect_b32 s4, 16, 14
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 14
; GENERIC-NEXT: s_cmp_eq_u32 s6, 13
; GENERIC-NEXT: s_cselect_b32 s7, 16, 13
; GENERIC-NEXT: s_cmp_eq_u32 s6, 12
; GENERIC-NEXT: s_cselect_b32 s6, 16, 12
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GENERIC-NEXT: s_waitcnt expcnt(0)
-; GENERIC-NEXT: v_mov_b32_e32 v7, s5
-; GENERIC-NEXT: v_mov_b32_e32 v6, s4
+; GENERIC-NEXT: v_mov_b32_e32 v7, s4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s5
; GENERIC-NEXT: v_mov_b32_e32 v5, s7
; GENERIC-NEXT: v_mov_b32_e32 v4, s6
; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 0605a15..a3bd0aa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -163,6 +163,31 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
ret void
}
+; FIXME: Broken
+; define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
+; %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
+; call void asm sideeffect "; use $0", "s"(<2 x i64> %x)
+; ret void
+; }
+
+; define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
+; %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src)
+; call void asm sideeffect "; use $0", "s"(<3 x i64> %x)
+; ret void
+; }
+
+; define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
+; %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src)
+; call void asm sideeffect "; use $0", "s"(<4 x i64> %x)
+; ret void
+; }
+
+; define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
+; %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src)
+; call void asm sideeffect "; use $0", "s"(<8 x i64> %x)
+; ret void
+; }
+
define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_f64:
; CHECK-SDAG: ; %bb.0:
@@ -637,6 +662,536 @@ define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
ret void
}
+define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:6]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x float> @llvm.amdgcn.readfirstlane.v3f32(<3 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v4f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v4f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <4 x float> @llvm.amdgcn.readfirstlane.v4f32(<4 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<4 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x float> @llvm.amdgcn.readfirstlane.v8f32(<8 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<8 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v16f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v16f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<16 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v32f32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s40, 4
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s41, 5
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s42, 6
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s43, 7
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s44, 8
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s45, 9
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s46, 10
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s47, 11
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 12
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 13
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 14
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 15
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 16
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 17
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 18
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 19
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s56, 20
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s57, 21
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s58, 22
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s59, 23
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s60, 24
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s61, 25
+; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s62, 26
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s63, 27
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 28
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 29
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 30
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 31
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[36:67]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 31
+; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 30
+; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 29
+; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 28
+; CHECK-SDAG-NEXT: v_readlane_b32 s63, v31, 27
+; CHECK-SDAG-NEXT: v_readlane_b32 s62, v31, 26
+; CHECK-SDAG-NEXT: v_readlane_b32 s61, v31, 25
+; CHECK-SDAG-NEXT: v_readlane_b32 s60, v31, 24
+; CHECK-SDAG-NEXT: v_readlane_b32 s59, v31, 23
+; CHECK-SDAG-NEXT: v_readlane_b32 s58, v31, 22
+; CHECK-SDAG-NEXT: v_readlane_b32 s57, v31, 21
+; CHECK-SDAG-NEXT: v_readlane_b32 s56, v31, 20
+; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 19
+; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 18
+; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 17
+; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 16
+; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 15
+; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 14
+; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 13
+; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 12
+; CHECK-SDAG-NEXT: v_readlane_b32 s47, v31, 11
+; CHECK-SDAG-NEXT: v_readlane_b32 s46, v31, 10
+; CHECK-SDAG-NEXT: v_readlane_b32 s45, v31, 9
+; CHECK-SDAG-NEXT: v_readlane_b32 s44, v31, 8
+; CHECK-SDAG-NEXT: v_readlane_b32 s43, v31, 7
+; CHECK-SDAG-NEXT: v_readlane_b32 s42, v31, 6
+; CHECK-SDAG-NEXT: v_readlane_b32 s41, v31, 5
+; CHECK-SDAG-NEXT: v_readlane_b32 s40, v31, 4
+; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2
+; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1
+; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0
+; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v32f32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s40, 4
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s41, 5
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s42, 6
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s43, 7
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s44, 8
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s45, 9
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s46, 10
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s47, 11
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 12
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 13
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 14
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 15
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 16
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 17
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 18
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 19
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s56, 20
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s57, 21
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s58, 22
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s59, 23
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s60, 24
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s61, 25
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s62, 26
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s63, 27
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 28
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 29
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 30
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 31
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[36:67]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 31
+; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 30
+; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 29
+; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 28
+; CHECK-GISEL-NEXT: v_readlane_b32 s63, v31, 27
+; CHECK-GISEL-NEXT: v_readlane_b32 s62, v31, 26
+; CHECK-GISEL-NEXT: v_readlane_b32 s61, v31, 25
+; CHECK-GISEL-NEXT: v_readlane_b32 s60, v31, 24
+; CHECK-GISEL-NEXT: v_readlane_b32 s59, v31, 23
+; CHECK-GISEL-NEXT: v_readlane_b32 s58, v31, 22
+; CHECK-GISEL-NEXT: v_readlane_b32 s57, v31, 21
+; CHECK-GISEL-NEXT: v_readlane_b32 s56, v31, 20
+; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 19
+; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 18
+; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 17
+; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 16
+; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 15
+; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 14
+; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 13
+; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 12
+; CHECK-GISEL-NEXT: v_readlane_b32 s47, v31, 11
+; CHECK-GISEL-NEXT: v_readlane_b32 s46, v31, 10
+; CHECK-GISEL-NEXT: v_readlane_b32 s45, v31, 9
+; CHECK-GISEL-NEXT: v_readlane_b32 s44, v31, 8
+; CHECK-GISEL-NEXT: v_readlane_b32 s43, v31, 7
+; CHECK-GISEL-NEXT: v_readlane_b32 s42, v31, 6
+; CHECK-GISEL-NEXT: v_readlane_b32 s41, v31, 5
+; CHECK-GISEL-NEXT: v_readlane_b32 s40, v31, 4
+; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2
+; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1
+; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0
+; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <32 x float> @llvm.amdgcn.readfirstlane.v32f32(<32 x float> %src)
+ call void asm sideeffect "; use $0", "s"(<32 x float> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v2i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v2i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v3i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v3i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:6]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<3 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v4i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v4i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<4 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v5i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:8]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v5i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:8]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <5 x i32> @llvm.amdgcn.readfirstlane.v5i32(<5 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<5 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v6i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v6i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <6 x i32> @llvm.amdgcn.readfirstlane.v6i32(<6 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<6 x i32> %x)
+ ret void
+}
+
define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
; CHECK-SDAG: ; %bb.0:
@@ -672,6 +1227,335 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
ret void
}
+define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v8i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v8i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<8 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v16i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v16i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<16 x i32> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v32i32:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s40, 4
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s41, 5
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s42, 6
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s43, 7
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s44, 8
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s45, 9
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s46, 10
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s47, 11
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 12
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 13
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 14
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 15
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 16
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 17
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 18
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 19
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s56, 20
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s57, 21
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s58, 22
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s59, 23
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s60, 24
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s61, 25
+; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s62, 26
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s63, 27
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 28
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 29
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 30
+; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 31
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v1
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[36:67]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 31
+; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 30
+; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 29
+; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 28
+; CHECK-SDAG-NEXT: v_readlane_b32 s63, v31, 27
+; CHECK-SDAG-NEXT: v_readlane_b32 s62, v31, 26
+; CHECK-SDAG-NEXT: v_readlane_b32 s61, v31, 25
+; CHECK-SDAG-NEXT: v_readlane_b32 s60, v31, 24
+; CHECK-SDAG-NEXT: v_readlane_b32 s59, v31, 23
+; CHECK-SDAG-NEXT: v_readlane_b32 s58, v31, 22
+; CHECK-SDAG-NEXT: v_readlane_b32 s57, v31, 21
+; CHECK-SDAG-NEXT: v_readlane_b32 s56, v31, 20
+; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 19
+; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 18
+; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 17
+; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 16
+; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 15
+; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 14
+; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 13
+; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 12
+; CHECK-SDAG-NEXT: v_readlane_b32 s47, v31, 11
+; CHECK-SDAG-NEXT: v_readlane_b32 s46, v31, 10
+; CHECK-SDAG-NEXT: v_readlane_b32 s45, v31, 9
+; CHECK-SDAG-NEXT: v_readlane_b32 s44, v31, 8
+; CHECK-SDAG-NEXT: v_readlane_b32 s43, v31, 7
+; CHECK-SDAG-NEXT: v_readlane_b32 s42, v31, 6
+; CHECK-SDAG-NEXT: v_readlane_b32 s41, v31, 5
+; CHECK-SDAG-NEXT: v_readlane_b32 s40, v31, 4
+; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3
+; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2
+; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1
+; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0
+; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v32i32:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
+; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s40, 4
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s41, 5
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s42, 6
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s43, 7
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s44, 8
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s45, 9
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s46, 10
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s47, 11
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 12
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 13
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 14
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 15
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 16
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 17
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 18
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 19
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s56, 20
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s57, 21
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s58, 22
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s59, 23
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s60, 24
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s61, 25
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s62, 26
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s63, 27
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 28
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 29
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 30
+; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 31
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[36:67]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 31
+; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 30
+; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 29
+; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 28
+; CHECK-GISEL-NEXT: v_readlane_b32 s63, v31, 27
+; CHECK-GISEL-NEXT: v_readlane_b32 s62, v31, 26
+; CHECK-GISEL-NEXT: v_readlane_b32 s61, v31, 25
+; CHECK-GISEL-NEXT: v_readlane_b32 s60, v31, 24
+; CHECK-GISEL-NEXT: v_readlane_b32 s59, v31, 23
+; CHECK-GISEL-NEXT: v_readlane_b32 s58, v31, 22
+; CHECK-GISEL-NEXT: v_readlane_b32 s57, v31, 21
+; CHECK-GISEL-NEXT: v_readlane_b32 s56, v31, 20
+; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 19
+; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 18
+; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 17
+; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 16
+; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 15
+; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 14
+; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 13
+; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 12
+; CHECK-GISEL-NEXT: v_readlane_b32 s47, v31, 11
+; CHECK-GISEL-NEXT: v_readlane_b32 s46, v31, 10
+; CHECK-GISEL-NEXT: v_readlane_b32 s45, v31, 9
+; CHECK-GISEL-NEXT: v_readlane_b32 s44, v31, 8
+; CHECK-GISEL-NEXT: v_readlane_b32 s43, v31, 7
+; CHECK-GISEL-NEXT: v_readlane_b32 s42, v31, 6
+; CHECK-GISEL-NEXT: v_readlane_b32 s41, v31, 5
+; CHECK-GISEL-NEXT: v_readlane_b32 s40, v31, 4
+; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3
+; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2
+; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1
+; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0
+; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <32 x i32> @llvm.amdgcn.readfirstlane.v32i32(<32 x i32> %src)
+ call void asm sideeffect "; use $0", "s"(<32 x i32> %x)
+ ret void
+}
+
define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
; CHECK-SDAG: ; %bb.0:
@@ -700,3 +1584,148 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
ret void
}
+
+define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v16i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v16i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <16 x i16> @llvm.amdgcn.readfirstlane.v16i16(<16 x i16> %src)
+ call void asm sideeffect "; use $0", "s"(<16 x i16> %x)
+ ret void
+}
+
+define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v32i16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v32i16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <32 x i16> @llvm.amdgcn.readfirstlane.v32i16(<32 x i16> %src)
+ call void asm sideeffect "; use $0", "s"(<32 x i16> %x)
+ ret void
+}
+
+
+define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_v32f16:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_v32f16:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %x = call <32 x half> @llvm.amdgcn.readfirstlane.v32f16(<32 x half> %src)
+ call void asm sideeffect "; use $0", "s"(<32 x half> %x)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 9540aa3..0071842b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -537,13 +537,12 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -649,11 +648,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v1.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -793,15 +789,14 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v3.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -923,12 +918,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -1097,19 +1089,17 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v6, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v6
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v6
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v6
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v6
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v3.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v8.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v1.h, v5.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1264,15 +1254,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.h
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v6.l
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v5.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v1.h, v3.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index b850428..87a659d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -14,30 +14,31 @@
define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-LABEL: s_log_f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_f32:
@@ -70,32 +71,34 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
@@ -132,29 +135,30 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-SDAG-LABEL: s_log_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_f32:
@@ -188,26 +192,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
@@ -316,44 +319,46 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s3, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v2f32:
@@ -398,49 +403,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; VI-SDAG-LABEL: s_log_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -494,41 +501,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; GFX900-SDAG-LABEL: s_log_v2f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
-; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3377d1cf
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s4
+; GFX900-SDAG-NEXT: v_ldexp_f32 v4, s3, v4
+; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3f317217
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s3, -v5
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s6, v6
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v6
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s3, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s6, v4
; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_v2f32:
@@ -574,39 +583,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s5
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3377d1cf, v1
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
-; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3377d1cf, v0
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3
+; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v1, -v4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v3, -v5
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v1 :: v_dual_fmac_f32 v7, 0x3377d1cf, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_add_f32 v5, v5, v7
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_mov_b32 v4, 0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
+; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_v2f32:
@@ -762,56 +769,59 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; SI-SDAG-LABEL: s_log_v3f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x3f317217
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s9, 0x3f317217
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s9, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s8, v6
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5
+; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v6
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; SI-SDAG-NEXT: v_log_f32_e32 v5, v0
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5
-; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v4, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v4
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; SI-SDAG-NEXT: v_fma_f32 v5, v4, s6, -v3
+; SI-SDAG-NEXT: v_fma_f32 v5, v4, s7, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v3f32:
@@ -871,55 +881,59 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; VI-SDAG-LABEL: s_log_v3f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v6, s1, v6
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v6
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v3
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v3
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v4, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v3
@@ -927,12 +941,12 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1005,55 +1019,58 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; GFX900-SDAG-LABEL: s_log_v3f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s2
; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v4
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s4, -v5
+; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s5, v6
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v7
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s4, -v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s5, v5
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s4, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s5, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log_v3f32:
@@ -1113,60 +1130,52 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s6
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3377d1cf, v2
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1
-; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3f317217, v2 :: v_dual_mul_f32 v7, 0x3f317217, v4
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
+; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3f317217, v2, -v6
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v4, -v7
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v9, 0x3377d1cf, v2 :: v_dual_fmac_f32 v10, 0x3377d1cf, v4
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317217, v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 v7, v7, v10
+; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v5, -v8
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v4|
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v11, 0x3377d1cf, v5 :: v_dual_sub_f32 v2, v2, v0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v8, v8, v11
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v5|
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_sub_f32 v1, v4, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_v3f32:
@@ -1387,68 +1396,72 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-SDAG-LABEL: s_log_v4f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf
-; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x3f317217
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3f317217
-; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s11, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s10, v6
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5
-; SI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v6
+; SI-SDAG-NEXT: v_mov_b32_e32 v7, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s9, v7
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v7, v7
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v6, v7, s6, -v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; SI-SDAG-NEXT: v_fma_f32 v6, v7, s7, v6
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s8, v1
; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v6
+; SI-SDAG-NEXT: v_fma_f32 v5, v6, s6, -v4
+; SI-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log_v4f32:
@@ -1520,84 +1533,88 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; VI-SDAG-LABEL: s_log_v4f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s3
+; VI-SDAG-NEXT: v_ldexp_f32 v6, s2, v6
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v6
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v2
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4
+; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
+; VI-SDAG-NEXT: v_log_f32_e32 v7, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
+; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v7
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; VI-SDAG-NEXT: v_sub_f32_e32 v6, v7, v5
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317000, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v6, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1690,67 +1707,71 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; GFX900-SDAG-LABEL: s_log_v4f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
-; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3f317217
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6
-; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6
-; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x3377d1cf
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s8, -v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s3
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s9, v6
+; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
+; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s2, v7
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v7
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s8, -v2
+; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s9, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v8, v8
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v8
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s8, -v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s9, v7
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s8, -v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s9, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v5, v0
; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-SDAG-NEXT: s_endpgm
;
@@ -1824,68 +1845,61 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s6
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s9
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v8, 0x3f317217, v2 :: v_dual_mul_f32 v9, 0x3f317217, v3
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317217, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v10, 0x3f317217, v6 :: v_dual_mul_f32 v11, 0x3f317217, v7
+; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v8
+; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v9
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_fma_f32 v14, 0x3f317217, v6, -v10
+; GFX1100-SDAG-NEXT: v_fma_f32 v15, 0x3f317217, v7, -v11
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5
-; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v14, 0x3377d1cf, v6 :: v_dual_fmac_f32 v15, 0x3377d1cf, v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v8, v8, v12 :: v_dual_add_f32 v9, v9, v13
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v10, v10, v14 :: v_dual_add_f32 v11, v11, v15
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v8, v3, v9 :: v_dual_mov_b32 v9, 0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v6|
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v7|
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_v4f32:
@@ -2143,8 +2157,7 @@ define float @v_log_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2189,8 +2202,7 @@ define float @v_log_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2241,8 +2253,7 @@ define float @v_log_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2286,22 +2297,21 @@ define float @v_log_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2347,8 +2357,7 @@ define float @v_log_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2393,8 +2402,7 @@ define float @v_log_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2445,8 +2453,7 @@ define float @v_log_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2491,22 +2498,20 @@ define float @v_log_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2554,8 +2559,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2600,8 +2604,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2652,8 +2655,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2698,22 +2700,20 @@ define float @v_log_fneg_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2762,8 +2762,7 @@ define float @v_log_fneg_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2808,8 +2807,7 @@ define float @v_log_fneg_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2860,8 +2858,7 @@ define float @v_log_fneg_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -2905,22 +2902,21 @@ define float @v_log_fneg_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2968,8 +2964,7 @@ define float @v_log_f32_fast(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -2995,8 +2990,7 @@ define float @v_log_f32_fast(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3022,8 +3016,7 @@ define float @v_log_f32_fast(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3048,12 +3041,10 @@ define float @v_log_f32_fast(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -3088,8 +3079,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3115,8 +3105,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3142,8 +3131,7 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3168,12 +3156,10 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -3208,8 +3194,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3235,8 +3220,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3262,8 +3246,7 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3288,12 +3271,10 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -3328,8 +3309,7 @@ define float @v_log_f32_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -3374,8 +3354,7 @@ define float @v_log_f32_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -3426,8 +3405,7 @@ define float @v_log_f32_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -3471,22 +3449,21 @@ define float @v_log_f32_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3532,8 +3509,7 @@ define float @v_log_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3559,8 +3535,7 @@ define float @v_log_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3586,8 +3561,7 @@ define float @v_log_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3612,12 +3586,10 @@ define float @v_log_f32_afn(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -3681,8 +3653,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3708,8 +3679,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3735,8 +3705,7 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3761,12 +3730,10 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -3801,8 +3768,7 @@ define float @v_fabs_log_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3828,8 +3794,7 @@ define float @v_fabs_log_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3855,8 +3820,7 @@ define float @v_fabs_log_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
@@ -3882,11 +3846,10 @@ define float @v_fabs_log_f32_afn(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
@@ -4063,8 +4026,7 @@ define float @v_log_f32_nnan(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4109,8 +4071,7 @@ define float @v_log_f32_nnan(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4161,8 +4122,7 @@ define float @v_log_f32_nnan(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4206,22 +4166,21 @@ define float @v_log_f32_nnan(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4407,8 +4366,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4453,8 +4411,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4505,8 +4462,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4550,22 +4506,21 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4751,8 +4706,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4797,8 +4751,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4849,8 +4802,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4894,22 +4846,21 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,8 +4906,7 @@ define float @v_log_f32_nnan_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -4995,8 +4945,7 @@ define float @v_log_f32_nnan_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -5041,8 +4990,7 @@ define float @v_log_f32_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -5080,20 +5028,18 @@ define float @v_log_f32_nnan_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5240,8 +5186,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -5280,8 +5225,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -5326,8 +5270,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -5365,20 +5308,18 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5451,8 +5392,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -5497,8 +5437,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -5549,8 +5488,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -5594,22 +5532,21 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6132,8 +6069,7 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
@@ -6302,8 +6238,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-NEXT: v_log_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0x3f317217
@@ -6326,8 +6261,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_log_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x7f800000
@@ -6353,8 +6287,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-NEXT: v_log_f32_e32 v0, v0
; GFX900-NEXT: s_mov_b32 s4, 0x3f317217
@@ -6377,22 +6310,20 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index d09df75..d7cefd6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -14,30 +14,31 @@
define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-LABEL: s_log10_f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1
-; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2
-; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x3284fbcf
+; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3
+; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_f32:
@@ -70,32 +71,34 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log10_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
@@ -132,29 +135,30 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-SDAG-LABEL: s_log10_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
+; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_f32:
@@ -188,26 +192,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s1
+; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3e9a209a, v1, -v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3284fbcf, v3
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
@@ -316,44 +319,46 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log10_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3
-; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v2f32:
@@ -398,49 +403,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; VI-SDAG-LABEL: s_log10_v2f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
@@ -494,41 +501,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; GFX900-SDAG-LABEL: s_log10_v2f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a
-; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3284fbcf
+; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, s4
+; GFX900-SDAG-NEXT: v_ldexp_f32 v4, s3, v4
+; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v4
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s3, -v5
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s6, v6
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v6
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s3, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v4, v6, s6, v4
; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s7
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_v2f32:
@@ -574,39 +583,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s5
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3284fbcf, v1
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
-; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3284fbcf, v0
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
-; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3
+; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v1, -v4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v3, -v5
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v1 :: v_dual_fmac_f32 v7, 0x3284fbcf, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_add_f32 v5, v5, v7
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_mov_b32 v4, 0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
+; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_v2f32:
@@ -762,56 +769,59 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-SDAG-LABEL: s_log10_v3f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s9, 0x3e9a209a
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s9, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s8, v6
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5
+; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v6
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v3
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0
-; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; SI-SDAG-NEXT: v_log_f32_e32 v5, v0
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5
-; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2
-; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v4, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v4
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; SI-SDAG-NEXT: v_fma_f32 v5, v4, s6, -v3
+; SI-SDAG-NEXT: v_fma_f32 v5, v4, s7, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s9
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v3f32:
@@ -871,55 +881,59 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; VI-SDAG-LABEL: s_log10_v3f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
-; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v0, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v6, s1, v6
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v6
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v3
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v3
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v4, v3
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v3
@@ -927,12 +941,12 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v4|, s6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1005,55 +1019,58 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX900-SDAG-LABEL: s_log10_v3f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s2
; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v4
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s4, -v5
+; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s5, v6
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v7
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s4, -v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s5, v5
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s4, -v3
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v5, s5, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s8
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_log10_v3f32:
@@ -1113,60 +1130,52 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s6
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3284fbcf, v2
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1
-; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, 0x3e9a209a, v2 :: v_dual_mul_f32 v7, 0x3e9a209a, v4
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
+; GFX1100-SDAG-NEXT: v_fma_f32 v9, 0x3e9a209a, v2, -v6
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v4, -v7
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v9, 0x3284fbcf, v2 :: v_dual_fmac_f32 v10, 0x3284fbcf, v4
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 v7, v7, v10
+; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v5, -v8
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v4|
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v11, 0x3284fbcf, v5 :: v_dual_sub_f32 v2, v2, v0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_add_f32_e32 v8, v8, v11
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v5|
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_sub_f32 v1, v4, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_v3f32:
@@ -1387,68 +1396,72 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-SDAG-LABEL: s_log10_v4f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
-; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf
-; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1
-; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; SI-SDAG-NEXT: s_mov_b32 s11, 0x3e9a209a
-; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3
-; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s11, v3
+; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s6, -v4
+; SI-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, v5
+; SI-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, s10, v6
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1
-; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5
-; SI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6
-; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5
-; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v6
+; SI-SDAG-NEXT: v_mov_b32_e32 v7, s4
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s6, -v2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s9, v7
+; SI-SDAG-NEXT: v_fma_f32 v4, v6, s7, v4
+; SI-SDAG-NEXT: v_log_f32_e32 v7, v7
+; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; SI-SDAG-NEXT: v_fma_f32 v6, v7, s6, -v5
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; SI-SDAG-NEXT: v_fma_f32 v6, v7, s7, v6
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s8, v1
; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
-; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-SDAG-NEXT: v_log_f32_e32 v6, v1
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v6
+; SI-SDAG-NEXT: v_fma_f32 v5, v6, s6, -v4
+; SI-SDAG-NEXT: v_fma_f32 v5, v6, s7, v5
+; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s11
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
+; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log10_v4f32:
@@ -1520,84 +1533,88 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; VI-SDAG-LABEL: s_log10_v4f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000
; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
-; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s3
+; VI-SDAG-NEXT: v_ldexp_f32 v6, s2, v6
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v6
+; VI-SDAG-NEXT: v_sub_f32_e32 v4, v6, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v2
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v8, v4
+; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v7, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
+; VI-SDAG-NEXT: v_log_f32_e32 v7, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5
+; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v7
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; VI-SDAG-NEXT: v_sub_f32_e32 v6, v7, v5
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4
+; VI-SDAG-NEXT: v_log_f32_e32 v6, v1
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v6, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4
; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1690,67 +1707,71 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX900-SDAG-LABEL: s_log10_v4f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
-; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
-; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3
-; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1
-; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s5, v6
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6
-; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6
-; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7
-; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3]
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6
-; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX900-SDAG-NEXT: s_mov_b32 s9, 0x3284fbcf
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s8, -v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, s3
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v3, s9, v6
+; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000
+; GFX900-SDAG-NEXT: v_ldexp_f32 v7, s2, v7
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v7
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s8, -v2
+; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8
+; GFX900-SDAG-NEXT: v_fma_f32 v5, v7, s9, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v8, v8
+; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v8
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s8, -v6
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-SDAG-NEXT: v_fma_f32 v7, v8, s9, v7
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1]
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v1
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v8|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s8, -v5
+; GFX900-SDAG-NEXT: v_fma_f32 v6, v7, s9, v6
+; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s10
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v5, v0
; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-SDAG-NEXT: s_endpgm
;
@@ -1824,68 +1845,61 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s6
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s9
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v8, 0x3e9a209a, v2 :: v_dual_mul_f32 v9, 0x3e9a209a, v3
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v10, 0x3e9a209a, v6 :: v_dual_mul_f32 v11, 0x3e9a209a, v7
+; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v8
+; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v9
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_fma_f32 v14, 0x3e9a209a, v6, -v10
+; GFX1100-SDAG-NEXT: v_fma_f32 v15, 0x3e9a209a, v7, -v11
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
-; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5
-; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1
-; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v14, 0x3284fbcf, v6 :: v_dual_fmac_f32 v15, 0x3284fbcf, v7
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v8, v8, v12 :: v_dual_add_f32 v9, v9, v13
+; GFX1100-SDAG-NEXT: v_dual_add_f32 v10, v10, v14 :: v_dual_add_f32 v11, v11, v15
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
+; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v8, v3, v9 :: v_dual_mov_b32 v9, 0
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v6|
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v7|
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_v4f32:
@@ -2143,8 +2157,7 @@ define float @v_log10_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2189,8 +2202,7 @@ define float @v_log10_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2241,8 +2253,7 @@ define float @v_log10_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2286,22 +2297,21 @@ define float @v_log10_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2347,8 +2357,7 @@ define float @v_log10_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2393,8 +2402,7 @@ define float @v_log10_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2445,8 +2453,7 @@ define float @v_log10_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2491,22 +2498,20 @@ define float @v_log10_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2554,8 +2559,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2600,8 +2604,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2652,8 +2655,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2698,22 +2700,20 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, s0
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2762,8 +2762,7 @@ define float @v_log10_fneg_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2808,8 +2807,7 @@ define float @v_log10_fneg_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -2860,8 +2858,7 @@ define float @v_log10_fneg_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -2905,22 +2902,21 @@ define float @v_log10_fneg_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -2968,8 +2964,7 @@ define float @v_log10_f32_fast(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -2995,8 +2990,7 @@ define float @v_log10_f32_fast(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3022,8 +3016,7 @@ define float @v_log10_f32_fast(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3048,12 +3041,10 @@ define float @v_log10_f32_fast(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -3088,8 +3079,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3115,8 +3105,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3142,8 +3131,7 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3168,12 +3156,10 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -3208,8 +3194,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3235,8 +3220,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3262,8 +3246,7 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3288,12 +3271,10 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -3328,8 +3309,7 @@ define float @v_log10_f32_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -3374,8 +3354,7 @@ define float @v_log10_f32_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -3426,8 +3405,7 @@ define float @v_log10_f32_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -3471,22 +3449,21 @@ define float @v_log10_f32_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3532,8 +3509,7 @@ define float @v_log10_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3559,8 +3535,7 @@ define float @v_log10_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3586,8 +3561,7 @@ define float @v_log10_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3612,12 +3586,10 @@ define float @v_log10_f32_afn(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -3681,8 +3653,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3708,8 +3679,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3735,8 +3705,7 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3761,12 +3730,10 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -3801,8 +3768,7 @@ define float @v_fabs_log10_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3828,8 +3794,7 @@ define float @v_fabs_log10_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3855,8 +3820,7 @@ define float @v_fabs_log10_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
@@ -3882,11 +3846,10 @@ define float @v_fabs_log10_f32_afn(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
@@ -4063,8 +4026,7 @@ define float @v_log10_f32_nnan(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4109,8 +4071,7 @@ define float @v_log10_f32_nnan(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4161,8 +4122,7 @@ define float @v_log10_f32_nnan(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4206,22 +4166,21 @@ define float @v_log10_f32_nnan(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4407,8 +4366,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4453,8 +4411,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4505,8 +4462,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4550,22 +4506,21 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4751,8 +4706,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4797,8 +4751,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -4849,8 +4802,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4894,22 +4846,21 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,8 +4906,7 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -4995,8 +4945,7 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -5041,8 +4990,7 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -5080,20 +5028,18 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5240,8 +5186,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -5280,8 +5225,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -5326,8 +5270,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -5365,20 +5308,18 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5451,8 +5392,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -5497,8 +5437,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
@@ -5549,8 +5488,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -5594,22 +5532,21 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6132,8 +6069,7 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -6302,8 +6238,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-NEXT: v_log_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -6326,8 +6261,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_log_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x7f800000
@@ -6353,8 +6287,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-NEXT: v_log_f32_e32 v0, v0
; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a
@@ -6377,22 +6310,20 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
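; The hunks in this file and the next all reflect the same instruction-selection
; change: the denormal-scaling path ahead of v_log_f32 used to materialize the
; ldexp exponent by selecting 0/1 and shifting left by 5, and now selects the
; constant 32 directly, dropping one VALU instruction per use. A minimal sketch
; of the pattern, assuming vcc already holds the |x| < 0x800000 (smallest
; normal) compare from the surrounding code:
;
;   before: two VALU ops to build the exponent
;     v_cndmask_b32_e64 v1, 0, 1, vcc    ; v1 = vcc ? 1 : 0
;     v_lshlrev_b32_e32 v1, 5, v1        ; v1 <<= 5, giving 0 or 32
;   after: one conditional move of the constant
;     v_cndmask_b32_e64 v1, 0, 32, vcc   ; v1 = vcc ? 32 : 0
;     v_ldexp_f32 v0, v0, v1             ; scale by 2^32 before the log
;
; For uniform (SGPR) inputs the new form instead goes through the scalar unit,
; e.g. s_and_b64 s[2:3], vcc, exec followed by s_cselect_b32 s2, 32, 0, which
; lets v_ldexp_f32 take the exponent operand straight from an SGPR.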
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 8b3b79b..ebfc953 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -14,18 +14,19 @@
define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; SI-SDAG-LABEL: s_log2_f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dword s2, s[4:5], 0xb
+; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -52,16 +53,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -90,17 +92,18 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-SDAG-LABEL: s_log2_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
@@ -130,13 +133,12 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0
+; GFX1100-SDAG-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, s3
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
@@ -221,14 +223,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s3, v3
+; SI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
; SI-SDAG-NEXT: v_log_f32_e32 v4, v1
@@ -271,14 +275,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
@@ -322,14 +328,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v1
@@ -365,26 +373,24 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-SDAG-LABEL: s_log2_v2f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4
+; GFX1100-SDAG-NEXT: s_and_b32 s4, s4, exec_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, v1
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; GFX1100-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX1100-SDAG-NEXT: s_cselect_b32 s5, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, s4
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s5
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
+; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v2
; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1100-SDAG-NEXT: s_endpgm
;
@@ -488,20 +494,23 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s1, v3
+; SI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v5, s1
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s1, v3
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s0, v5
+; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s0
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s0, v5
; SI-SDAG-NEXT: v_log_f32_e32 v7, v0
; SI-SDAG-NEXT: v_log_f32_e32 v5, v5
; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
@@ -555,19 +564,22 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s2
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: v_ldexp_f32 v5, s1, v5
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
@@ -622,19 +634,22 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s3
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; GFX900-SDAG-NEXT: v_ldexp_f32 v5, s1, v5
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5
@@ -682,35 +697,31 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; GFX1100-SDAG-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, s3
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, s6
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, v4
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, v5
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v4, v1
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, v2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[4:5]
; GFX1100-SDAG-NEXT: s_endpgm
@@ -852,24 +863,28 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s8, 32, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s8
+; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
+; SI-SDAG-NEXT: s_cselect_b32 s7, 32, 0
; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v5, s7
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s6, v5
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s6, v5
+; SI-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s5, v7
+; SI-SDAG-NEXT: s_cselect_b32 s5, 32, 0
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
; SI-SDAG-NEXT: v_log_f32_e32 v5, v5
@@ -930,26 +945,30 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6
+; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
+; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
; VI-SDAG-NEXT: v_log_f32_e32 v5, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; VI-SDAG-NEXT: v_log_f32_e32 v7, v7
; VI-SDAG-NEXT: v_log_f32_e32 v8, v1
@@ -1011,24 +1030,28 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX900-SDAG-NEXT: s_cselect_b32 s3, 32, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, s3
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0
+; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s2, v6
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v8, 5, v8
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s2, v6
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, s2
+; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8
+; GFX900-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1
; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6
@@ -1085,42 +1108,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, s9
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s6
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7
+; GFX1100-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, v2
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, v7
+; GFX1100-SDAG-NEXT: s_cselect_b32 s7, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s8, s8, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, s6
+; GFX1100-SDAG-NEXT: s_cselect_b32 s3, 32, 0
+; GFX1100-SDAG-NEXT: s_and_b32 s6, s9, exec_lo
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, s7
+; GFX1100-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, s3
+; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, s2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6
; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_3)
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v2, v0
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, v6
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v8, v1
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v7, v5
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
+; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v6, v4
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[4:5]
; GFX1100-SDAG-NEXT: s_endpgm
;
@@ -1272,8 +1290,7 @@ define float @v_log2_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1300,8 +1317,7 @@ define float @v_log2_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1328,8 +1344,7 @@ define float @v_log2_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1355,12 +1370,10 @@ define float @v_log2_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1400,8 +1413,7 @@ define float @v_log2_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1428,8 +1440,7 @@ define float @v_log2_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1456,8 +1467,7 @@ define float @v_log2_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1484,11 +1494,10 @@ define float @v_log2_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1529,8 +1538,7 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1557,8 +1565,7 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1585,8 +1592,7 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1613,11 +1619,10 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1659,8 +1664,7 @@ define float @v_log2_fneg_f32(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1687,8 +1691,7 @@ define float @v_log2_fneg_f32(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1715,8 +1718,7 @@ define float @v_log2_fneg_f32(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1742,12 +1744,10 @@ define float @v_log2_fneg_f32(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1788,8 +1788,7 @@ define float @v_log2_f32_fast(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1816,8 +1815,7 @@ define float @v_log2_f32_fast(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1844,8 +1842,7 @@ define float @v_log2_f32_fast(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1871,12 +1868,10 @@ define float @v_log2_f32_fast(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1916,8 +1911,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1944,8 +1938,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1972,8 +1965,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -1999,12 +1991,10 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2044,8 +2034,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2072,8 +2061,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2100,8 +2088,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2127,12 +2114,10 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2172,8 +2157,7 @@ define float @v_log2_f32_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2200,8 +2184,7 @@ define float @v_log2_f32_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2228,8 +2211,7 @@ define float @v_log2_f32_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2255,12 +2237,10 @@ define float @v_log2_f32_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2300,8 +2280,7 @@ define float @v_log2_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2328,8 +2307,7 @@ define float @v_log2_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2356,8 +2334,7 @@ define float @v_log2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2383,12 +2360,10 @@ define float @v_log2_f32_afn(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2454,8 +2429,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2482,8 +2456,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2510,8 +2483,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2537,12 +2509,10 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2582,8 +2552,7 @@ define float @v_fabs_log2_f32_afn(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2610,8 +2579,7 @@ define float @v_fabs_log2_f32_afn(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2638,8 +2606,7 @@ define float @v_fabs_log2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2666,11 +2633,10 @@ define float @v_fabs_log2_f32_afn(float %in) {
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, s0
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2737,8 +2703,7 @@ define float @v_log2_f32_nnan(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2765,8 +2730,7 @@ define float @v_log2_f32_nnan(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2793,8 +2757,7 @@ define float @v_log2_f32_nnan(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2820,12 +2783,10 @@ define float @v_log2_f32_nnan(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2891,8 +2852,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2919,8 +2879,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2947,8 +2906,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -2974,12 +2932,10 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -3045,8 +3001,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3073,8 +3028,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3101,8 +3055,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3128,12 +3081,10 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -3173,8 +3124,7 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3201,8 +3151,7 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3229,8 +3178,7 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3256,12 +3204,10 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -3327,8 +3273,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3355,8 +3300,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3383,8 +3327,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3410,12 +3353,10 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -3481,8 +3422,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3509,8 +3449,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3537,8 +3476,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3564,12 +3502,10 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -3738,8 +3674,7 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3806,8 +3741,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; SI-NEXT: v_ldexp_f32_e32 v0, v0, v2
; SI-NEXT: v_log_f32_e32 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3821,8 +3755,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
; VI-NEXT: v_ldexp_f32 v0, v0, v1
; VI-NEXT: v_log_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3836,8 +3769,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
; GFX900-NEXT: v_log_f32_e32 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000
@@ -3851,11 +3783,10 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
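Note: every hunk in the llvm.log2.ll diff above is the same two-instructions-to-one peephole. The SDAG lowering used to materialize the "input is below the smallest normal float" condition as 0/1 with v_cndmask_b32 and then shift it left by 5 with v_lshlrev_b32; the shift is now folded into the select, so a single v_cndmask_b32 picks 0 or 32 directly, and on GFX11 the dependent s_delay_alu bookkeeping shrinks with it. A minimal C sketch of the identity being exploited follows; the helper names are illustrative, and the threshold/offset reading (FLT_MIN compare, 32.0f correction) is inferred from the surrounding asm rather than stated in the patch:

    #include <assert.h>
    #include <math.h>

    /* (cond ? 1 : 0) << 5 produces the same values as (cond ? 32 : 0),
     * so the separate shift instruction is redundant. */
    static int select_then_shift(int cond) { return (cond ? 1 : 0) << 5; }
    static int select_immediate(int cond)  { return cond ? 32 : 0; }

    int main(void) {
      for (int cond = 0; cond <= 1; ++cond)
        assert(select_then_shift(cond) == select_immediate(cond));
      /* The selected exponent feeds v_ldexp_f32: inputs below the smallest
       * normal float (bit pattern 0x800000) are scaled by 2^32 before the
       * hardware log2, and the 0x42000000 constant (32.0f) is subtracted
       * back out at the end. Scaling by a power of two is exact. */
      float tiny = 0x1p-130f;
      assert(ldexpf(tiny, 32) == tiny * 0x1p32f);
      return 0;
    }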
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index edc67ec..0c8dbe8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1324,53 +1324,21 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
}
define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
-; GFX6-LABEL: s_set_rounding_select_2_0:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
-; GFX6-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX6-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX6-NEXT: v_readfirstlane_b32 s34, v0
-; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_select_2_0:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_cmp_eq_u32 s4, 0
-; GFX7-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX7-NEXT: v_readfirstlane_b32 s34, v0
-; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_select_2_0:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_mov_b32 s34, 0xa50f
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX8-NEXT: v_readfirstlane_b32 s34, v0
-; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX678-LABEL: s_set_rounding_select_2_0:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT: s_cmp_eq_u32 s4, 0
+; GFX678-NEXT: s_movk_i32 s34, 0xa5
+; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50f
+; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_set_rounding_select_2_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_mov_b32 s34, 0xa50f
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s34
-; GFX9-NEXT: v_readfirstlane_b32 s34, v0
+; GFX9-NEXT: s_movk_i32 s34, 0xa5
+; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50f
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1378,11 +1346,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX10-NEXT: v_readfirstlane_b32 s34, v0
+; GFX10-NEXT: s_movk_i32 s34, 0xa5
+; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50f
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1390,11 +1355,8 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_movk_i32 s0, 0xa5
+; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50f
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1530,10 +1492,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX678-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX678-NEXT: v_readfirstlane_b32 s34, v0
-; GFX678-NEXT: s_lshl_b32 s34, s34, 2
+; GFX678-NEXT: s_cselect_b32 s34, 4, 0
; GFX678-NEXT: s_add_i32 s35, s34, -4
; GFX678-NEXT: s_min_u32 s34, s34, s35
; GFX678-NEXT: s_lshl_b32 s36, s34, 2
@@ -1547,10 +1506,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT: v_readfirstlane_b32 s34, v0
-; GFX9-NEXT: s_lshl_b32 s34, s34, 2
+; GFX9-NEXT: s_cselect_b32 s34, 4, 0
; GFX9-NEXT: s_add_i32 s35, s34, -4
; GFX9-NEXT: s_min_u32 s34, s34, s35
; GFX9-NEXT: s_lshl_b32 s36, s34, 2
@@ -1564,10 +1520,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s34, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT: v_readfirstlane_b32 s34, v0
-; GFX10-NEXT: s_lshl_b32 s34, s34, 2
+; GFX10-NEXT: s_cselect_b32 s34, 4, 0
; GFX10-NEXT: s_add_i32 s35, s34, -4
; GFX10-NEXT: s_min_u32 s36, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
@@ -1581,10 +1534,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_cselect_b32 s0, 4, 0
; GFX11-NEXT: s_add_i32 s1, s0, -4
; GFX11-NEXT: s_min_u32 s2, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
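Note: the llvm.set.rounding.ll hunks apply the same idea on the scalar side. Instead of copying SCC into a VGPR (s_cselect_b64 -1/0 plus v_cndmask_b32 0/1), shifting, doing the mode-table lookup in the VALU, and reading the lane back with v_readfirstlane_b32, the shift amount is folded into the s_cselect_b32 immediates and the whole sequence stays on the SALU. For s_set_rounding_select_2_0 this works because both candidate values come from the 0xa50f table, 0xa50f >> 8 == 0xa5, and hwreg(HW_REG_MODE, 0, 4) only consumes the low four bits anyway; the s_set_rounding_select_4_0 hunks fold the shift-by-2 the same way, turning (cond ? 1 : 0) << 2 into s_cselect_b32 of 4 or 0. A small C sketch of the 2_0 equivalence (helper names are illustrative):

    #include <assert.h>

    /* Old: shift the packed table by 0 or 8, then take the low nibble.
     * New: select between the pre-shifted constants directly. */
    static unsigned mode_via_shift(int cond) {
      return (0xa50fu >> ((cond ? 1u : 0u) << 3)) & 0xfu;
    }
    static unsigned mode_via_cselect(int cond) {
      return (cond ? 0xa5u : 0xa50fu) & 0xfu;
    }

    int main(void) {
      for (int cond = 0; cond <= 1; ++cond)
        assert(mode_via_shift(cond) == mode_via_cselect(cond));
      return 0;
    }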
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index e7b405d..24a4d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -519,8 +519,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s6, 1
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GCN-NEXT: s_cselect_b32 s4, 1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
entry:
@@ -561,8 +561,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out,
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s6, 1
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GCN-NEXT: s_cselect_b32 s4, 1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
entry:
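Note: in private-memory-atomics.ll the stored boolean no longer round-trips through a 64-bit condition mask. s_cmp_eq_u32 already sets SCC, so s_cselect_b32 s4, 1, 0 materializes the i1 in a scalar register and a plain v_mov_b32 feeds the buffer store, replacing the s_cselect_b64 / v_cndmask_b32 pair.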
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index 1e61068..e5e3ba6 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -59,13 +59,9 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
; GFX12-SDAG-LABEL: v_s_log_f32:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, v0
-; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -305,12 +301,8 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_and_b32 s1, s0, 0x7fffffff
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s1, 0x800000
-; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, v0
-; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, s1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -342,13 +334,9 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
; GFX12-SDAG-LABEL: srcmods_neg_f32:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000
-; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, v0
-; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo
+; GFX12-SDAG-NEXT: s_cselect_b32 s1, 32, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, s1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0
; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
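Note: the GFX12 hunks in pseudo-scalar-transcendental.ll go one step further. Here v_ldexp_f32 takes its exponent operand straight from an SGPR (visible in the new check lines), so the 32-or-0 result of s_cselect_b32 is consumed directly; the VGPR materialization, the v_lshlrev_b32, and the extra s_and_b32 with exec_lo all disappear, and SCC stays live for the following s_cselect_b32 of the 0x42000000 correction term.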
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index 81aa3a3..1da55cf 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -653,4 +653,24 @@ body: |
S_ENDPGM 0
...
-
+---
+name: test_remat_s_mov_b64_imm_pseudo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo
+ ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
+ ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3
+ ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
+ ; GCN-NEXT: S_ENDPGM 0
+ %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1
+ %1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2
+ %2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3
+ S_NOP 0, implicit %0
+ S_NOP 0, implicit %1
+ S_NOP 0, implicit %2
+ S_ENDPGM 0
+...
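Note: remat-sop.mir gains a test asserting that S_MOV_B64_IMM_PSEUDO is rematerializable. In the expected output the third constant is not kept live across the first two S_NOPs; its S_MOV_B64_IMM_PSEUDO 3 is re-emitted immediately before its use, reusing $sgpr0_sgpr1 instead of tying up a third register pair.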
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 554e364..b78cbb0 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -19,12 +19,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -106,10 +106,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
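Note: the rsq.f64.ll changes below are the f64 flavor of the same fold, with a shift by 8 (an exponent adjustment of 0x100 = 256 for double). Two shapes appear: when the condition can move to SCC, s_and_b64 with exec plus s_cselect_b32 picks 0x100 or 0 entirely on the SALU; when the value must stay in VGPRs, 0x100 is first materialized with v_mov_b32 (the nonzero arm sits in the src1 slot of v_cndmask_b32_e32, which the VOP2 encoding requires to be a VGPR) and the existing v_cndmask_b32 selects between 0 and that register.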
@@ -199,12 +199,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -286,10 +286,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], s[2:3], exec
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -380,12 +380,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -467,10 +467,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -560,12 +560,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -647,10 +647,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 9
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -741,12 +741,12 @@ define double @v_rsq_f64(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -827,8 +827,8 @@ define double @v_rsq_f64(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -910,12 +910,12 @@ define double @v_rsq_f64_fabs(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -996,8 +996,8 @@ define double @v_rsq_f64_fabs(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -1080,12 +1080,12 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -1166,8 +1166,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -1249,12 +1249,12 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -1335,8 +1335,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -1418,12 +1418,12 @@ define double @v_neg_rsq_f64(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -1504,8 +1504,8 @@ define double @v_neg_rsq_f64(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -1588,23 +1588,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5]
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
@@ -1743,45 +1742,44 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
@@ -1890,23 +1888,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5]
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
@@ -2045,45 +2042,44 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0
; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
@@ -2191,12 +2187,12 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -2315,8 +2311,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -2434,23 +2430,22 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5]
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000
; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
@@ -2592,45 +2587,44 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
-; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
-; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
; VI-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
; VI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
; VI-SDAG-NEXT: v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
; VI-SDAG-NEXT: v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
; VI-SDAG-NEXT: v_rcp_f64_e32 v[9:10], v[5:6]
@@ -2738,12 +2732,12 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 9
; SI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -2824,8 +2818,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 9
; VI-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -2909,12 +2903,12 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -2995,8 +2989,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3078,12 +3072,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3148,8 +3142,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3223,12 +3217,12 @@ define double @v_rsq_f64__afn(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3293,8 +3287,8 @@ define double @v_rsq_f64__afn(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3368,12 +3362,12 @@ define double @v_neg_rsq_f64__afn(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3439,8 +3433,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3515,12 +3509,12 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3585,8 +3579,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3660,12 +3654,12 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3730,8 +3724,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3805,12 +3799,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -3875,8 +3869,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -3950,12 +3944,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -4021,8 +4015,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -4097,12 +4091,12 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -4183,8 +4177,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -4266,40 +4260,39 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v12, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; SI-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
; SI-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
; SI-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[0:1], v[8:9]
; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v12
; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
; SI-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5]
; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
@@ -4391,11 +4384,10 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
@@ -4417,8 +4409,8 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; VI-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
; VI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; VI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
@@ -4519,12 +4511,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; SI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; SI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v8
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
@@ -4590,10 +4582,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
; VI-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -4675,12 +4667,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
@@ -4745,8 +4737,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
@@ -5074,24 +5066,24 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5158,8 +5150,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
@@ -5241,24 +5233,24 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5325,8 +5317,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
@@ -5408,24 +5400,24 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-SDAG-NEXT: v_mov_b32_e32 v11, 0x260
; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
-; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
-; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT: v_mov_b32_e32 v10, 0xffffff80
; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x260
+; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v6
; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5492,8 +5484,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-SDAG-NEXT: v_mul_f64 v[6:7], v[2:3], v[4:5]
@@ -5575,17 +5567,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; SI-SDAG-NEXT: s_mov_b32 s4, 0
; SI-SDAG-NEXT: s_brev_b32 s5, 8
; SI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
; SI-SDAG-NEXT: s_mov_b32 s6, 0
; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000
-; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000
; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000
; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; SI-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
@@ -5665,10 +5657,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_brev_b32 s5, 8
; VI-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
; VI-SDAG-NEXT: s_mov_b32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 0acb4a49..cc08708 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1364,12 +1364,12 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64:
@@ -1797,17 +1797,22 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-GISEL-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xffc0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v2.l, 0xffc0, v2.l
-; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xffc0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
new file mode 100644
index 0000000..ab960a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-partial-csr-sgpr-live-ins.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-lower-sgpr-spills %s -o /dev/null 2>&1 | FileCheck -check-prefix=VERIFIER %s
+
+# FIXME: Currently, MRI's live-in check for registers does not take the sub-registers of the corresponding live-ins into account. As a
+# result, in SILowerSGPRSpills the sub-register spill gets marked as killed even though its super-register is in the function's live-ins.
+# This makes the machine verifier fail on a later direct use of that sub-register, a use that was not a problem before the spill was added.
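+#
+# Illustration only: a simplified, hypothetical shape of the live-in query
+# (the real code lives in MachineRegisterInfo). It compares whole registers,
+# so asking about $sgpr50 misses the live-in pair $sgpr50_sgpr51:
+#
+#   bool isLiveIn(Register Reg) const {
+#     for (const auto &LI : liveIns())
+#       if (Register(LI.first) == Reg || LI.second == Reg)
+#         return true;  // exact match only; a sub-register of a live-in
+#     return false;     // super-register is not reported as live-in
+#   }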
+
+# VERIFIER: After SI lower SGPR spill instructions
+
+# VERIFIER: *** Bad machine code: Using an undefined physical register ***
+# VERIFIER: - instruction: S_NOP 0, implicit $sgpr50
+# VERIFIER-NEXT: - operand 1: implicit $sgpr50
+
+# VERIFIER: *** Bad machine code: Using an undefined physical register ***
+# VERIFIER: - instruction: S_NOP 0, implicit $sgpr52
+# VERIFIER-NEXT: - operand 1: implicit $sgpr52
+
+# VERIFIER: *** Bad machine code: Using an undefined physical register ***
+# VERIFIER: - instruction: S_NOP 0, implicit $sgpr55
+# VERIFIER-NEXT: - operand 1: implicit $sgpr55
+
+# VERIFIER: LLVM ERROR: Found 3 machine code errors.
+---
+name: spill_partial_live_csr_sgpr_test
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr50_sgpr51' }
+ - { reg: '$sgpr52_sgpr53' }
+ - { reg: '$sgpr54_sgpr55' }
+body: |
+ bb.0:
+ liveins: $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55
+
+ S_NOP 0, implicit $sgpr50
+ $sgpr50 = S_MOV_B32 0
+ S_NOP 0, implicit $sgpr52
+ $sgpr52_sgpr53 = S_MOV_B64 0
+ S_NOP 0, implicit $sgpr55
+ $sgpr54_sgpr55 = S_MOV_B64 0
+ $sgpr56 = S_MOV_B32 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index dd78c2f..a6e6341 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0
; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77]
; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s70, s16
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: s_mov_b32 s71, s15
; GLOBALNESS1-NEXT: s_mov_b32 s72, s14
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0
-; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1]
+; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3]
+; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0
@@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69]
@@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s71
; GLOBALNESS1-NEXT: s_mov_b32 s14, s70
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_14
; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
@@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_1
; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7]
; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0
; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73]
; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5]
@@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400
@@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s68, s16
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: s_mov_b32 s69, s15
; GLOBALNESS0-NEXT: s_mov_b32 s70, s14
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0
-; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24
; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3]
+; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45]
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0
; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0
@@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41]
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73]
@@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s69
; GLOBALNESS0-NEXT: s_mov_b32 s14, s68
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
@@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_1
; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5ecfad9..24f5821 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -114,8 +114,11 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART
@@ -243,8 +246,11 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART
@@ -486,8 +492,11 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h|
; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART
@@ -617,8 +626,11 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
-; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
+; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h
; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 0af8c95..0b277f8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -15,14 +15,11 @@ target datalayout = "A5"
; GCN-ALLOCA: buffer_load_dword
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
+; GCN-PROMOTE: s_cselect_b32 [[IND1:s[0-9]+]], 1, 0
; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
+; GCN-PROMOTE: s_cselect_b32 [[IND2:s[0-9]+]], [[IND1]], 2
; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
+; GCN-PROMOTE: s_cselect_b32 [[IND3:s[0-9]+]], [[IND2]], 3
; GCN-PROMOTE: ScratchSize: 0
define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
@@ -51,7 +48,7 @@ entry:
; GCN-ALLOCA-COUNT-5: buffer_store_dword
; GCN-ALLOCA: buffer_load_dword
-; GCN-PROMOTE-COUNT-7: v_cndmask
+; GCN-PROMOTE-COUNT-7: s_cselect_b32
; GCN-PROMOTE: ScratchSize: 0
@@ -292,14 +289,11 @@ entry:
; GCN-ALLOCA: buffer_load_dword
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
+; GCN-PROMOTE: s_cselect_b32 [[IND1:s[0-9]+]], 1, 0
; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
+; GCN-PROMOTE: s_cselect_b32 [[IND2:s[0-9]+]], [[IND1]], 2
; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
+; GCN-PROMOTE: s_cselect_b32 [[IND3:s[0-9]+]], [[IND2]], 3
; GCN-PROMOTE: ScratchSize: 0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 08f5550..0e7e4b9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -4,10 +4,10 @@
# Check that we get two move-immediates into %1 and %2, instead of a copy from
# %1 to %2, because that would introduce a dependency and maybe a stall.
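+# For illustration only (hypothetical MIR, not part of the checked output):
+# given
+#   %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+#   %1:vgpr_32 = COPY %0
+# the COPY makes %1 wait on %0's definition, whereas rematerializing the
+# immediate as
+#   %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# lets both definitions issue independently.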
---
-name: f
+name: remat_v_mov_b32_e32
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: f
+ ; CHECK-LABEL: name: remat_v_mov_b32_e32
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0
@@ -46,3 +46,47 @@ body: |
%4.sub1:vreg_96 = COPY %2:vgpr_32
S_ENDPGM 0, implicit %4
...
+
+---
+name: remat_v_mov_b64_pseudo
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: remat_v_mov_b64_pseudo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]]
+ bb.0:
+ liveins: $sgpr0
+ %0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %1:vreg_64_align2 = COPY %0:vreg_64_align2
+ %2:vreg_64_align2 = COPY %0:vreg_64_align2
+ %3:sreg_64 = COPY $sgpr0_sgpr1
+ $exec = S_MOV_B64_term %3:sreg_64
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+ %2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
+
+ bb.2:
+ undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2
+ %4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2
+ S_ENDPGM 0, implicit %4
+...
diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
index 6427a3e..186276b 100644
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -295,15 +295,13 @@ define i32 @t7(i32 %a, i32 %b) nounwind readnone {
; ARM-LABEL: t7:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: subs r0, r0, r1
-; ARM-NEXT: movne r0, #1
-; ARM-NEXT: lsl r0, r0, #2
+; ARM-NEXT: movne r0, #4
; ARM-NEXT: mov pc, lr
;
; ARMT2-LABEL: t7:
; ARMT2: @ %bb.0: @ %entry
; ARMT2-NEXT: subs r0, r0, r1
-; ARMT2-NEXT: movwne r0, #1
-; ARMT2-NEXT: lsl r0, r0, #2
+; ARMT2-NEXT: movwne r0, #4
; ARMT2-NEXT: bx lr
;
; THUMB1-LABEL: t7:
@@ -318,8 +316,7 @@ define i32 @t7(i32 %a, i32 %b) nounwind readnone {
; THUMB2: @ %bb.0: @ %entry
; THUMB2-NEXT: subs r0, r0, r1
; THUMB2-NEXT: it ne
-; THUMB2-NEXT: movne r0, #1
-; THUMB2-NEXT: lsls r0, r0, #2
+; THUMB2-NEXT: movne r0, #4
; THUMB2-NEXT: bx lr
;
; V8MBASE-LABEL: t7:
@@ -824,15 +821,13 @@ define i32 @t12(i32 %a) nounwind {
; ARM-LABEL: t12:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: cmp r0, #0
-; ARM-NEXT: movne r0, #1
-; ARM-NEXT: lsl r0, r0, #1
+; ARM-NEXT: movne r0, #2
; ARM-NEXT: mov pc, lr
;
; ARMT2-LABEL: t12:
; ARMT2: @ %bb.0: @ %entry
; ARMT2-NEXT: cmp r0, #0
-; ARMT2-NEXT: movwne r0, #1
-; ARMT2-NEXT: lsl r0, r0, #1
+; ARMT2-NEXT: movwne r0, #2
; ARMT2-NEXT: bx lr
;
; THUMB1-LABEL: t12:
@@ -846,8 +841,7 @@ define i32 @t12(i32 %a) nounwind {
; THUMB2: @ %bb.0: @ %entry
; THUMB2-NEXT: cmp r0, #0
; THUMB2-NEXT: it ne
-; THUMB2-NEXT: movne r0, #1
-; THUMB2-NEXT: lsls r0, r0, #1
+; THUMB2-NEXT: movne r0, #2
; THUMB2-NEXT: bx lr
;
; V8MBASE-LABEL: t12:
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
new file mode 100644
index 0000000..c07e8d9
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
@@ -0,0 +1,19 @@
+; Regression test for https://github.com/llvm/llvm-project/issues/128560 -
+; check that cbuffers are populated correctly when there aren't any other kinds
+; of resource.
+
+; RUN: opt -S -passes=dxil-translate-metadata %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define void @cbuffer_is_only_binding() {
+ %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ @llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, i1 false)
+ ; CHECK: %cbuffer = type
+
+ ret void
+}
+
+; CHECK: @[[CB0:.*]] = external constant %cbuffer
+
+; CHECK: !{i32 0, ptr @[[CB0]], !""
diff --git a/llvm/test/CodeGen/DirectX/clamp.ll b/llvm/test/CodeGen/DirectX/clamp.ll
index 6345abc..54c6aa5 100644
--- a/llvm/test/CodeGen/DirectX/clamp.ll
+++ b/llvm/test/CodeGen/DirectX/clamp.ll
@@ -286,7 +286,7 @@ declare <3 x half> @llvm.dx.nclamp.v3f16(<3 x half>, <3 x half>, <3 x half>)
declare <4 x float> @llvm.dx.nclamp.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.dx.nclamp.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <4 x i32> @llvm.dx.sclamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <3 x i16> @llvm.dx.uclamp.v3i32(<3 x i16>, <3 x i32>, <3 x i16>)
+declare <3 x i16> @llvm.dx.uclamp.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
declare <4 x i32> @llvm.dx.uclamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.dx.uclamp.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/discard.ll b/llvm/test/CodeGen/DirectX/discard.ll
index 2a9ec59..81a96d5 100644
--- a/llvm/test/CodeGen/DirectX/discard.ll
+++ b/llvm/test/CodeGen/DirectX/discard.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes='function(scalarizer),module(dxil-op-lower,dxil-intrinsic-expansion)' -S -mtriple=dxil-pc-shadermodel6.3-pixel %s | FileCheck %s
+; RUN: opt -passes='function(scalarizer),module(dxil-intrinsic-expansion,dxil-op-lower)' -S -mtriple=dxil-pc-shadermodel6.3-pixel %s | FileCheck %s
; CHECK-LABEL: define void @test_scalar
; CHECK: call void @dx.op.discard(i32 82, i1 %0)
diff --git a/llvm/test/CodeGen/DirectX/unsupported_intrinsic.ll b/llvm/test/CodeGen/DirectX/unsupported_intrinsic.ll
new file mode 100644
index 0000000..d703f2f
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/unsupported_intrinsic.ll
@@ -0,0 +1,11 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; CHECK: error: <unknown>:0:0: in function llvm.vector.reduce.and.v4i32 i32 (<4 x i32>): Unsupported intrinsic for DXIL lowering
+define i32 @fn_and(<4 x i32> %0) local_unnamed_addr #0 {
+ %2 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %0)
+ ret i32 %2
+}
+
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+
+attributes #0 = { convergent norecurse nounwind "hlsl.export"}
diff --git a/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll b/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll
new file mode 100644
index 0000000..436a233
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/bittracker-regclass.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv75 -mattr=+hvxv75,+hvx-length64b,-small-data < %s | FileCheck %s
+
+; Test that the compiler generates code, and doesn't crash, when it creates a
+; DoubleReg value with an IntLow8Reg value. The BitTracker pass needs to
+; handle this register class.
+
+; CHECK: [[REG:r[0-9]+:[0-9]+]] = combine(#33,#32)
+; CHECK: memd({{.*}}) = [[REG]]
+
+@out = external dso_local global [100 x i32], align 512
+@in55 = external dso_local global [55 x i32], align 256
+@.str.3 = external dso_local unnamed_addr constant [29 x i8], align 1
+
+define dso_local void @main(i1 %cond) local_unnamed_addr #0 {
+entry:
+ br label %for.body.i198
+
+for.body.i198:
+ br i1 %cond, label %for.body34.preheader, label %for.body.i198
+
+for.body34.preheader:
+ %wide.load269.5 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([100 x i32], [100 x i32]* @out, i32 0, i32 80) to <16 x i32>*), align 64
+ %0 = add nsw <16 x i32> %wide.load269.5, zeroinitializer
+ %rdx.shuf270 = shufflevector <16 x i32> %0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx271 = add <16 x i32> %0, %rdx.shuf270
+ %bin.rdx273 = add <16 x i32> %bin.rdx271, zeroinitializer
+ %bin.rdx275 = add <16 x i32> %bin.rdx273, zeroinitializer
+ %bin.rdx277 = add <16 x i32> %bin.rdx275, zeroinitializer
+ %1 = extractelement <16 x i32> %bin.rdx277, i32 0
+ %add45 = add nsw i32 0, %1
+ %add45.1 = add nsw i32 0, %add45
+ %add45.2 = add nsw i32 0, %add45.1
+ %add45.3 = add nsw i32 0, %add45.2
+ call void (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.3, i32 0, i32 0), i32 %add45.3) #2
+ store i32 32, i32* getelementptr inbounds ([55 x i32], [55 x i32]* @in55, i32 0, i32 32), align 128
+ store i32 33, i32* getelementptr inbounds ([55 x i32], [55 x i32]* @in55, i32 0, i32 33), align 4
+ ret void
+}
+
+declare dso_local void @printf(i8*, ...) local_unnamed_addr #1
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll
new file mode 100644
index 0000000..ddac8c1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v128i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.b,v{{[0-9]+}}.b)
+; CHECK: [[REG2:(r[0-9]+)]] = #-1
+; CHECK: v0 = vand([[REG1]],[[REG2]])
+
+define void @compare_vectors(<128 x i8> %a, <128 x i8> %b) {
+entry:
+ %result = icmp eq <128 x i8> %a, %b
+ call i32 @f.1(<128 x i1> %result)
+ ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+
+define i32 @f.1(<128 x i1> %vec) {
+ %element = extractelement <128 x i1> %vec, i32 6
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll
new file mode 100644
index 0000000..bbb2697
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v16i1.ll
@@ -0,0 +1,40 @@
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.w,v{{[0-9]+}}.w)
+; CHECK: [[REG2:(r[0-9]+)]] = #-1
+; CHECK: v0 = vand([[REG1]],[[REG2]])
+
+define void @compare_vectors(<16 x i32> %a, <16 x i32> %b) {
+entry:
+ %result = icmp eq <16 x i32> %a, %b
+ call i32 @f.1(<16 x i1> %result)
+ ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+
+define i32 @f.1(<16 x i1> %vec) {
+ %element = extractelement <16 x i1> %vec, i32 6
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll
new file mode 100644
index 0000000..a734787
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v32i1.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK-128
+
+; CHECK-LABEL: compare_vectors
+; CHECK-64: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.h,v{{[0-9]+}}.h)
+; CHECK-64: [[REG2:(r[0-9]+)]] = #-1
+; CHECK-64: v0 = vand([[REG1]],[[REG2]])
+; CHECK-128: r{{[0-9]+}}:{{[0-9]+}} = combine(##.LCPI0_0,#-1)
+; CHECK-128: [[REG1:(q[0-9]+)]] = vcmp.eq(v0.h,v1.h)
+; CHECK-128: [[REG2:(v[0-9]+)]] = vand([[REG1]],r{{[0-9]+}})
+; CHECK-128: [[REG3:(v[0-9]+)]] = vmem(r{{[0-9]+}}+#0)
+; CHECK-128: [[REG4:(v[0-9]+)]] = vdelta([[REG2]],[[REG3]])
+; CHECK-128: [[REG5:(q[0-9]+)]] = vand([[REG4]],r{{[0-9]+}})
+; CHECK-128: v0 = vand([[REG5]],r{{[0-9]+}})
+
+define void @compare_vectors(<32 x i16> %a, <32 x i16> %b) {
+entry:
+ %result = icmp eq <32 x i16> %a, %b
+ call i32 @f.1(<32 x i1> %result)
+ ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK-64: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-64: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK-64: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+; CHECK-128: [[REG6:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-128: [[REG7:(v[0-9]+)]] = vand([[REG6]],r{{[0-9]+}})
+; CHECK-128: r{{[0-9]+}} = vextract([[REG7]],r{{[0-9]+}})
+
+define i32 @f.1(<32 x i1> %vec) {
+ %element = extractelement <32 x i1> %vec, i32 6
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll
new file mode 100644
index 0000000..af23e6c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v4i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG0:(p[0-9]+)]] = vcmph.eq([[REG1:(r[0-9]+):[0-9]]],[[REG2:(r[0-9]+):[0-9]]])
+; CHECK: [[REG1:(r[0-9]+):[0-9]]] = CONST64(#281479271743489)
+; CHECK: [[REG2:(r[0-9]+):[0-9]]] = mask([[REG0]])
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = and([[REG2]],[[REG1]])
+
+define void @compare_vectors(<4 x i16> %a, <4 x i16> %b) {
+entry:
+ %result = icmp eq <4 x i16> %a, %b
+ call i32 @f.1(<4 x i1> %result)
+ ret void
+}
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(r[0-9]+)]] = and([[REG3]],##65537)
+; CHECK: [[REG4:(r[0-9]+)]] = and([[REG4]],##65537)
+define i32 @f.1(<4 x i1> %vec) {
+ %element = extractelement <4 x i1> %vec, i32 2
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll
new file mode 100644
index 0000000..7cc5620
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v64i1.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK-128
+
+; CHECK-LABEL: compare_vectors
+; CHECK-64: [[REG1:(q[0-9]+)]] = vcmp.eq(v{{[0-9]+}}.b,v{{[0-9]+}}.b)
+; CHECK-64: [[REG2:(r[0-9]+)]] = #-1
+; CHECK-64: v0 = vand([[REG1]],[[REG2]])
+; CHECK-128: r{{[0-9]+}}:{{[0-9]+}} = combine(##.LCPI0_0,#-1)
+; CHECK-128: [[REG1:(q[0-9]+)]] = vcmp.eq(v0.b,v1.b)
+; CHECK-128: [[REG2:(v[0-9]+)]] = vand([[REG1]],r{{[0-9]+}})
+; CHECK-128: [[REG3:(v[0-9]+)]] = vmem(r{{[0-9]+}}+#0)
+; CHECK-128: [[REG4:(v[0-9]+)]] = vdelta([[REG2]],[[REG3]])
+; CHECK-128: [[REG5:(q[0-9]+)]] = vand([[REG4]],r{{[0-9]+}})
+; CHECK-128: v0 = vand([[REG5]],r{{[0-9]+}})
+
+define void @compare_vectors(<64 x i8> %a, <64 x i8> %b) {
+entry:
+ %result = icmp eq <64 x i8> %a, %b
+ call i32 @f.1(<64 x i1> %result)
+ ret void
+}
+
+; CHECK-LABEL: f.1:
+; CHECK-64: [[REG3:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-64: [[REG4:(v[0-9]+)]] = vand([[REG3]],r{{[0-9]+}})
+; CHECK-64: r{{[0-9]+}} = vextract([[REG4]],r{{[0-9]+}})
+; CHECK-128: [[REG6:(q[0-9]+)]] = vand(v0,r{{[0-9]+}})
+; CHECK-128: [[REG7:(v[0-9]+)]] = vand([[REG6]],r{{[0-9]+}})
+; CHECK-128: r{{[0-9]+}} = vextract([[REG7]],r{{[0-9]+}})
+
+define i32 @f.1(<64 x i1> %vec) {
+ %element = extractelement <64 x i1> %vec, i32 6
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll b/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll
new file mode 100644
index 0000000..2ec163a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/calloperand-v8i1.ll
@@ -0,0 +1,39 @@
+;RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length64b < %s -o - | FileCheck %s --check-prefix=CHECK
+;RUN: llc -mtriple=hexagon -mattr=+hvxv79,+hvx-length128b < %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: compare_vectors
+; CHECK: [[REG0:(p[0-9]+)]] = vcmpb.eq([[REG1:(r[0-9]+):[0-9]]],[[REG2:(r[0-9]+):[0-9]]])
+; CHECK: [[REG1:(r[0-9]+):[0-9]]] = CONST64(#72340172838076673)
+; CHECK: [[REG2:(r[0-9]+):[0-9]]] = mask([[REG0]])
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = and([[REG2]],[[REG1]])
+
+define void @compare_vectors(<8 x i8> %a, <8 x i8> %b) {
+entry:
+ %result = icmp eq <8 x i8> %a, %b
+ call i32 @f.1(<8 x i1> %result)
+ ret void
+}
+; CHECK-LABEL: f.1:
+; CHECK: [[REG3:(r[0-9]+)]] = and([[REG3]],##16843009)
+; CHECK: [[REG4:(r[0-9]+)]] = and([[REG4]],##16843009)
+define i32 @f.1(<8 x i1> %vec) {
+ %element = extractelement <8 x i1> %vec, i32 6
+ %is_true = icmp eq i1 %element, true
+ br i1 %is_true, label %if_true, label %if_false
+
+if_true:
+ call void @action_if_true()
+ br label %end
+
+if_false:
+ call void @action_if_false()
+ br label %end
+
+end:
+ %result = phi i32 [1, %if_true], [0, %if_false]
+ ret i32 %result
+}
+
+declare void @action_if_true()
+declare void @action_if_false()
diff --git a/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll b/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll
index 8166c46..1ffae4f 100644
--- a/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll
+++ b/llvm/test/CodeGen/MSP430/shift-amount-threshold.ll
@@ -115,13 +115,12 @@ define i16 @testShiftAnd_1(i16 %x) {
; CHECK-LABEL: testShiftAnd_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov r12, r13
-; CHECK-NEXT: mov #1, r12
+; CHECK-NEXT: mov #2, r12
; CHECK-NEXT: tst r13
; CHECK-NEXT: jl .LBB6_2
; CHECK-NEXT: ; %bb.1: ; %entry
; CHECK-NEXT: clr r12
; CHECK-NEXT: .LBB6_2: ; %entry
-; CHECK-NEXT: add r12, r12
; CHECK-NEXT: ret
entry:
%cmp = icmp slt i16 %x, 0
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index c144de4..4c5c44a 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
@@ -22,129 +23,305 @@ declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
-; CHECK-LABEL: test_ldu_i8
define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u8
+; CHECK-LABEL: test_ldu_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i8_param_0];
+; CHECK-NEXT: ldu.global.u8 %rs1, [%rd1];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: and.b32 %r2, %r1, 255;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
ret i8 %val
}
-; CHECK-LABEL: test_ldu_i16
define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u16
+; CHECK-LABEL: test_ldu_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i16_param_0];
+; CHECK-NEXT: ldu.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call i16 @llvm.nvvm.ldu.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
ret i16 %val
}
-; CHECK-LABEL: test_ldu_i32
define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u32
+; CHECK-LABEL: test_ldu_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i32_param_0];
+; CHECK-NEXT: ldu.global.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
ret i32 %val
}
-; CHECK-LABEL: test_ldu_i64
define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u64
+; CHECK-LABEL: test_ldu_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i64_param_0];
+; CHECK-NEXT: ldu.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
ret i64 %val
}
-; CHECK-LABEL: test_ldu_p
define ptr @test_ldu_p(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u64
+; CHECK-LABEL: test_ldu_p(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_p_param_0];
+; CHECK-NEXT: ldu.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = tail call ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 8)
ret ptr %val
}
-
-; CHECK-LABEL: test_ldu_f32
define float @test_ldu_f32(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.f32
+; CHECK-LABEL: test_ldu_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f32_param_0];
+; CHECK-NEXT: ldu.global.f32 %f1, [%rd1];
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
%val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
ret float %val
}
-; CHECK-LABEL: test_ldu_f64
define double @test_ldu_f64(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.f64
+; CHECK-LABEL: test_ldu_f64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f64_param_0];
+; CHECK-NEXT: ldu.global.f64 %fd1, [%rd1];
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT: ret;
%val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
ret double %val
}
-; CHECK-LABEL: test_ldu_f16
define half @test_ldu_f16(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u16
+; CHECK-LABEL: test_ldu_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f16_param_0];
+; CHECK-NEXT: ldu.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
%val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
ret half %val
}
-; CHECK-LABEL: test_ldu_v2f16
define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
- ; CHECK: ldu.global.u32
+; CHECK-LABEL: test_ldu_v2f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_v2f16_param_0];
+; CHECK-NEXT: ldu.global.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
ret <2 x half> %val
}
-; CHECK-LABEL: test_ldg_i8
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u8
+; CHECK-LABEL: test_ldg_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i8_param_0];
+; CHECK-NEXT: ld.global.nc.u8 %rs1, [%rd1];
+; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
ret i8 %val
}
-; CHECK-LABEL: test_ldg_i16
define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u16
+; CHECK-LABEL: test_ldg_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i16_param_0];
+; CHECK-NEXT: ld.global.nc.u16 %rs1, [%rd1];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
ret i16 %val
}
-; CHECK-LABEL: test_ldg_i32
define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u32
+; CHECK-LABEL: test_ldg_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i32_param_0];
+; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
ret i32 %val
}
-; CHECK-LABEL: test_ldg_i64
define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u64
+; CHECK-LABEL: test_ldg_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i64_param_0];
+; CHECK-NEXT: ld.global.nc.u64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
ret i64 %val
}
-; CHECK-LABEL: test_ldg_p
define ptr @test_ldg_p(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u64
+; CHECK-LABEL: test_ldg_p(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_p_param_0];
+; CHECK-NEXT: ld.global.nc.u64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%val = tail call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 8)
ret ptr %val
}
-; CHECK-LABEL: test_ldg_f32
define float @test_ldg_f32(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.f32
+; CHECK-LABEL: test_ldg_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f32_param_0];
+; CHECK-NEXT: ld.global.nc.f32 %f1, [%rd1];
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
%val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
ret float %val
}
-; CHECK-LABEL: test_ldg_f64
define double @test_ldg_f64(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.f64
+; CHECK-LABEL: test_ldg_f64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f64_param_0];
+; CHECK-NEXT: ld.global.nc.f64 %fd1, [%rd1];
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT: ret;
%val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
ret double %val
}
-; CHECK-LABEL: test_ldg_f16
define half @test_ldg_f16(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u16
+; CHECK-LABEL: test_ldg_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f16_param_0];
+; CHECK-NEXT: ld.global.nc.u16 %rs1, [%rd1];
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
+; CHECK-NEXT: ret;
%val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
ret half %val
}
-; CHECK-LABEL: test_ldg_v2f16
define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
- ; CHECK: ld.global.nc.u32
+; CHECK-LABEL: test_ldg_v2f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_v2f16_param_0];
+; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
ret <2 x half> %val
}
+
+@g = addrspace(1) global i32 0
+
+define i32 @test_ldg_asi() {
+; CHECK-LABEL: test_ldg_asi(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.nc.u32 %r1, [g+4];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4)
+ ret i32 %val
+}
+
+define i32 @test_ldu_asi() {
+; CHECK-LABEL: test_ldu_asi(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ldu.global.u32 %r1, [g+4];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4)
+ ret i32 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 4d4db21..377528b 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -214,34 +214,33 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: mov.u64 %rd3, __const_$_bar_$_s1;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd3+7];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
; CHECK-PTX-NEXT: st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3+6];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
; CHECK-PTX-NEXT: st.local.u8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd3+5];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
; CHECK-PTX-NEXT: st.local.u8 [%rd2], %rs6;
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1;
; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs7;
-; CHECK-PTX-NEXT: mov.b64 %rd4, 1;
-; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd4;
-; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 8;
+; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
+; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd3;
+; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics2,
@@ -380,28 +379,27 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<10>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: mov.u64 %rd3, __const_$_qux_$_s;
-; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3+8];
-; CHECK-PTX-NEXT: st.local.u64 [%rd2+8], %rd4;
-; CHECK-PTX-NEXT: ld.global.nc.u64 %rd5, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT: st.local.u64 [%rd2], %rd5;
-; CHECK-PTX-NEXT: mov.b64 %rd6, 1;
-; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd6;
-; CHECK-PTX-NEXT: ld.local.u64 %rd7, [%rd2];
-; CHECK-PTX-NEXT: ld.local.u64 %rd8, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd9, %SP, 16;
+; CHECK-PTX-NEXT: ld.global.nc.u64 %rd3, [__const_$_qux_$_s+8];
+; CHECK-PTX-NEXT: st.local.u64 [%rd2+8], %rd3;
+; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT: st.local.u64 [%rd2], %rd4;
+; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT: ld.local.u64 %rd6, [%rd2];
+; CHECK-PTX-NEXT: ld.local.u64 %rd7, [%rd2+8];
+; CHECK-PTX-NEXT: add.u64 %rd8, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd8;
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd7;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd9;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd8;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics4,
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll
new file mode 100644
index 0000000..aaf81ff
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux \
+; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
+
+define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128(ppc_fp128 %a) {
+; CHECK-LABEL: test_sincos_ppcf128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -64(r1)
+; CHECK-NEXT: std r0, 80(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: addi r5, r1, 48
+; CHECK-NEXT: addi r6, r1, 32
+; CHECK-NEXT: bl sincosl
+; CHECK-NEXT: nop
+; CHECK-NEXT: lfd f1, 48(r1)
+; CHECK-NEXT: lfd f2, 56(r1)
+; CHECK-NEXT: lfd f3, 32(r1)
+; CHECK-NEXT: lfd f4, 40(r1)
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 %a)
+ ret { ppc_fp128, ppc_fp128 } %result
+}
+
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) {
+; CHECK-LABEL: test_sincospi_ppcf128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -64(r1)
+; CHECK-NEXT: std r0, 80(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: addi r5, r1, 48
+; CHECK-NEXT: addi r6, r1, 32
+; CHECK-NEXT: bl sincospil
+; CHECK-NEXT: nop
+; CHECK-NEXT: lfd f1, 48(r1)
+; CHECK-NEXT: lfd f2, 56(r1)
+; CHECK-NEXT: lfd f3, 32(r1)
+; CHECK-NEXT: lfd f4, 40(r1)
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+ ret { ppc_fp128, ppc_fp128 } %result
+}
+
+; FIXME: This could be made a tail call with the default expansion of llvm.sincos.
+define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: test_sincos_ppcf128_void_tail_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: bl sincosl
+; CHECK-NEXT: nop
+; CHECK-NEXT: addi r1, r1, 32
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 %a)
+ %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+ %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+ store ppc_fp128 %result.0, ptr %out_sin, align 16
+ store ppc_fp128 %result.1, ptr %out_cos, align 16
+ ret void
+}
+
+; FIXME: This could be made a tail call with the default expansion of llvm.sincospi.
+define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: test_sincospi_ppcf128_void_tail_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: bl sincospil
+; CHECK-NEXT: nop
+; CHECK-NEXT: addi r1, r1, 32
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+ %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+ %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+ store ppc_fp128 %result.0, ptr %out_sin, align 16
+ store ppc_fp128 %result.1, ptr %out_cos, align 16
+ ret void
+}
+
+; NOTE: This would need a struct-return library call for llvm.sincos to become a tail call.
+define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) {
+; CHECK-LABEL: test_sincos_ppcf128_tail_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -64(r1)
+; CHECK-NEXT: std r0, 80(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: addi r5, r1, 48
+; CHECK-NEXT: addi r6, r1, 32
+; CHECK-NEXT: bl sincosl
+; CHECK-NEXT: nop
+; CHECK-NEXT: lfd f1, 48(r1)
+; CHECK-NEXT: lfd f2, 56(r1)
+; CHECK-NEXT: lfd f3, 32(r1)
+; CHECK-NEXT: lfd f4, 40(r1)
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 %a)
+ ret { ppc_fp128, ppc_fp128 } %result
+}
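+
+; A sketch of why (illustrative, not part of the test): the sincosl libcall
+; has the out-pointer form
+;   void sincosl(long double x, long double *sin, long double *cos);
+; so the caller must allocate stack slots and reload both results after the
+; call, which blocks tail-call formation. A hypothetical struct-returning
+; variant, e.g.
+;   { ppc_fp128, ppc_fp128 } @sincosl_sret(ppc_fp128)
+; could be tail-called directly, since both results would come back in
+; registers (f1-f4 here).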
+
+; NOTE: This would need a struct-return library call for llvm.sincospi to become a tail call.
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) {
+; CHECK-LABEL: test_sincospi_ppcf128_tail_call:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -64(r1)
+; CHECK-NEXT: std r0, 80(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset lr, 16
+; CHECK-NEXT: addi r5, r1, 48
+; CHECK-NEXT: addi r6, r1, 32
+; CHECK-NEXT: bl sincospil
+; CHECK-NEXT: nop
+; CHECK-NEXT: lfd f1, 48(r1)
+; CHECK-NEXT: lfd f2, 56(r1)
+; CHECK-NEXT: lfd f3, 32(r1)
+; CHECK-NEXT: lfd f4, 40(r1)
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+ %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+ ret { ppc_fp128, ppc_fp128 } %result
+}
diff --git a/llvm/test/CodeGen/RISCV/or-is-add.ll b/llvm/test/CodeGen/RISCV/or-is-add.ll
index 7356167..ab20312 100644
--- a/llvm/test/CodeGen/RISCV/or-is-add.ll
+++ b/llvm/test/CodeGen/RISCV/or-is-add.ll
@@ -11,8 +11,8 @@ define signext i32 @test1(i32 signext %x) {
;
; RV64-LABEL: test1:
; RV64: # %bb.0:
-; RV64-NEXT: slliw a0, a0, 1
-; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: addiw a0, a0, 1
; RV64-NEXT: ret
%a = shl i32 %x, 1
%b = or i32 %a, 1
@@ -45,8 +45,8 @@ define signext i32 @test3(i32 signext %x) {
;
; RV64-LABEL: test3:
; RV64: # %bb.0:
-; RV64-NEXT: slliw a0, a0, 3
-; RV64-NEXT: addi a0, a0, 6
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: addiw a0, a0, 6
; RV64-NEXT: ret
%a = shl i32 %x, 3
%b = add i32 %a, 6
@@ -83,7 +83,7 @@ define signext i32 @test5(i32 signext %x) {
; RV64-LABEL: test5:
; RV64: # %bb.0:
; RV64-NEXT: srliw a0, a0, 24
-; RV64-NEXT: addi a0, a0, 256
+; RV64-NEXT: addiw a0, a0, 256
; RV64-NEXT: ret
%a = lshr i32 %x, 24
%b = xor i32 %a, 256
@@ -101,7 +101,7 @@ define i64 @test6(i64 %x) {
; RV64-LABEL: test6:
; RV64: # %bb.0:
; RV64-NEXT: srli a0, a0, 54
-; RV64-NEXT: addi a0, a0, 1024
+; RV64-NEXT: addiw a0, a0, 1024
; RV64-NEXT: ret
%a = lshr i64 %x, 54
%b = xor i64 %a, 1024
@@ -121,3 +121,105 @@ define signext i32 @test7(i32 signext %x) {
%a = or disjoint i32 %x, 1
ret i32 %a
}
+
+define void @pr128468(ptr %0, i32 signext %1, i32 signext %2) {
+; RV32-LABEL: pr128468:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a3, a1, 3
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: lw a2, 4(a3)
+; RV32-NEXT: bgez a2, .LBB7_6
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: addi a2, a2, 1
+; RV32-NEXT: beq a2, a1, .LBB7_6
+; RV32-NEXT: # %bb.2: # %.preheader
+; RV32-NEXT: addi a3, a3, 4
+; RV32-NEXT: j .LBB7_4
+; RV32-NEXT: .LBB7_3: # in Loop: Header=BB7_4 Depth=1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: addi a3, a3, 4
+; RV32-NEXT: beq a1, a1, .LBB7_6
+; RV32-NEXT: .LBB7_4: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a0, a1
+; RV32-NEXT: lw a4, 0(a1)
+; RV32-NEXT: mv a1, a2
+; RV32-NEXT: sw a4, 0(a3)
+; RV32-NEXT: slli a3, a2, 3
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: lw a2, 4(a3)
+; RV32-NEXT: bgez a2, .LBB7_3
+; RV32-NEXT: # %bb.5: # in Loop: Header=BB7_4 Depth=1
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: addi a2, a2, 1
+; RV32-NEXT: addi a3, a3, 4
+; RV32-NEXT: bne a2, a1, .LBB7_4
+; RV32-NEXT: .LBB7_6:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: pr128468:
+; RV64: # %bb.0:
+; RV64-NEXT: slliw a2, a1, 1
+; RV64-NEXT: slli a3, a2, 2
+; RV64-NEXT: add a3, a0, a3
+; RV64-NEXT: lw a4, 4(a3)
+; RV64-NEXT: bgez a4, .LBB7_6
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: addiw a2, a2, 1
+; RV64-NEXT: beq a2, a1, .LBB7_6
+; RV64-NEXT: # %bb.2: # %.preheader
+; RV64-NEXT: addi a3, a3, 4
+; RV64-NEXT: j .LBB7_4
+; RV64-NEXT: .LBB7_3: # in Loop: Header=BB7_4 Depth=1
+; RV64-NEXT: mv a2, a1
+; RV64-NEXT: addi a3, a3, 4
+; RV64-NEXT: beq a1, a1, .LBB7_6
+; RV64-NEXT: .LBB7_4: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: add a1, a0, a1
+; RV64-NEXT: lw a4, 0(a1)
+; RV64-NEXT: mv a1, a2
+; RV64-NEXT: slliw a2, a2, 1
+; RV64-NEXT: sw a4, 0(a3)
+; RV64-NEXT: slli a3, a2, 2
+; RV64-NEXT: add a3, a0, a3
+; RV64-NEXT: lw a4, 4(a3)
+; RV64-NEXT: bgez a4, .LBB7_3
+; RV64-NEXT: # %bb.5: # in Loop: Header=BB7_4 Depth=1
+; RV64-NEXT: addiw a2, a2, 1
+; RV64-NEXT: addi a3, a3, 4
+; RV64-NEXT: bne a2, a1, .LBB7_4
+; RV64-NEXT: .LBB7_6:
+; RV64-NEXT: ret
+ %4 = shl nsw i32 %1, 1
+ %5 = or disjoint i32 %4, 1
+ %6 = sext i32 %5 to i64
+ %7 = getelementptr inbounds i32, ptr %0, i64 %6
+ %8 = load i32, ptr %7, align 4
+ %9 = icmp sgt i32 %8, -1
+ %10 = icmp eq i32 %5, %1
+ %11 = or i1 %9, %10
+ br i1 %11, label %27, label %12
+
+12: ; preds = %3, %12
+ %13 = phi i32 [ %25, %12 ], [ %5, %3 ]
+ %14 = phi ptr [ %22, %12 ], [ %7, %3 ]
+ %15 = phi i32 [ %13, %12 ], [ %1, %3 ]
+ %16 = sext i32 %15 to i64
+ %17 = getelementptr inbounds i32, ptr %0, i64 %16
+ %18 = load i32, ptr %17, align 4
+ store i32 %18, ptr %14, align 4
+ %19 = shl nsw i32 %13, 1
+ %20 = or disjoint i32 %19, 1
+ %21 = sext i32 %20 to i64
+ %22 = getelementptr inbounds i32, ptr %0, i64 %21
+ %23 = load i32, ptr %22, align 4
+ %24 = icmp slt i32 %23, 0
+ %25 = select i1 %24, i32 %20, i32 %13
+ %26 = icmp eq i32 %25, %13
+ br i1 %26, label %27, label %12
+
+27: ; preds = %12, %3
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/select-const.ll b/llvm/test/CodeGen/RISCV/select-const.ll
index 90a81c5..4538572 100644
--- a/llvm/test/CodeGen/RISCV/select-const.ll
+++ b/llvm/test/CodeGen/RISCV/select-const.ll
@@ -114,7 +114,7 @@ define signext i32 @select_const_int_harder(i1 zeroext %a) nounwind {
; RV64ZICOND: # %bb.0:
; RV64ZICOND-NEXT: li a1, 32
; RV64ZICOND-NEXT: czero.nez a0, a1, a0
-; RV64ZICOND-NEXT: addi a0, a0, 6
+; RV64ZICOND-NEXT: addiw a0, a0, 6
; RV64ZICOND-NEXT: ret
%1 = select i1 %a, i32 6, i32 38
ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index 4405cc3..303c4ac2 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1884,15 +1884,22 @@ define i32 @select_cst_diff2(i1 zeroext %cond) {
; RV64IMXVTCONDOPS: # %bb.0:
; RV64IMXVTCONDOPS-NEXT: li a1, 2
; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
-; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 120
+; RV64IMXVTCONDOPS-NEXT: addiw a0, a0, 120
; RV64IMXVTCONDOPS-NEXT: ret
;
-; CHECKZICOND-LABEL: select_cst_diff2:
-; CHECKZICOND: # %bb.0:
-; CHECKZICOND-NEXT: li a1, 2
-; CHECKZICOND-NEXT: czero.nez a0, a1, a0
-; CHECKZICOND-NEXT: addi a0, a0, 120
-; CHECKZICOND-NEXT: ret
+; RV32IMZICOND-LABEL: select_cst_diff2:
+; RV32IMZICOND: # %bb.0:
+; RV32IMZICOND-NEXT: li a1, 2
+; RV32IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT: addi a0, a0, 120
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_cst_diff2:
+; RV64IMZICOND: # %bb.0:
+; RV64IMZICOND-NEXT: li a1, 2
+; RV64IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT: addiw a0, a0, 120
+; RV64IMZICOND-NEXT: ret
%ret = select i1 %cond, i32 120, i32 122
ret i32 %ret
}
@@ -2074,15 +2081,22 @@ define i32 @select_cst_diff8_invert(i1 zeroext %cond) {
; RV64IMXVTCONDOPS: # %bb.0:
; RV64IMXVTCONDOPS-NEXT: li a1, 8
; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
-; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 6
+; RV64IMXVTCONDOPS-NEXT: addiw a0, a0, 6
; RV64IMXVTCONDOPS-NEXT: ret
;
-; CHECKZICOND-LABEL: select_cst_diff8_invert:
-; CHECKZICOND: # %bb.0:
-; CHECKZICOND-NEXT: li a1, 8
-; CHECKZICOND-NEXT: czero.nez a0, a1, a0
-; CHECKZICOND-NEXT: addi a0, a0, 6
-; CHECKZICOND-NEXT: ret
+; RV32IMZICOND-LABEL: select_cst_diff8_invert:
+; RV32IMZICOND: # %bb.0:
+; RV32IMZICOND-NEXT: li a1, 8
+; RV32IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT: addi a0, a0, 6
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_cst_diff8_invert:
+; RV64IMZICOND: # %bb.0:
+; RV64IMZICOND-NEXT: li a1, 8
+; RV64IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT: addiw a0, a0, 6
+; RV64IMZICOND-NEXT: ret
%ret = select i1 %cond, i32 6, i32 14
ret i32 %ret
}
@@ -2151,15 +2165,22 @@ define i32 @select_cst_diff1024_invert(i1 zeroext %cond) {
; RV64IMXVTCONDOPS: # %bb.0:
; RV64IMXVTCONDOPS-NEXT: li a1, 1024
; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
-; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 6
+; RV64IMXVTCONDOPS-NEXT: addiw a0, a0, 6
; RV64IMXVTCONDOPS-NEXT: ret
;
-; CHECKZICOND-LABEL: select_cst_diff1024_invert:
-; CHECKZICOND: # %bb.0:
-; CHECKZICOND-NEXT: li a1, 1024
-; CHECKZICOND-NEXT: czero.nez a0, a1, a0
-; CHECKZICOND-NEXT: addi a0, a0, 6
-; CHECKZICOND-NEXT: ret
+; RV32IMZICOND-LABEL: select_cst_diff1024_invert:
+; RV32IMZICOND: # %bb.0:
+; RV32IMZICOND-NEXT: li a1, 1024
+; RV32IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT: addi a0, a0, 6
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_cst_diff1024_invert:
+; RV64IMZICOND: # %bb.0:
+; RV64IMZICOND-NEXT: li a1, 1024
+; RV64IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT: addiw a0, a0, 6
+; RV64IMZICOND-NEXT: ret
%ret = select i1 %cond, i32 6, i32 1030
ret i32 %ret
}
diff --git a/llvm/test/CodeGen/Thumb/branchless-cmp.ll b/llvm/test/CodeGen/Thumb/branchless-cmp.ll
index 40c5b885..e5bfb87 100644
--- a/llvm/test/CodeGen/Thumb/branchless-cmp.ll
+++ b/llvm/test/CodeGen/Thumb/branchless-cmp.ll
@@ -1,101 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m0 %s -verify-machineinstrs -o - | FileCheck %s
define i32 @test1a(i32 %a, i32 %b) {
+; CHECK-LABEL: test1a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: subs r1, r0, #1
+; CHECK-NEXT: sbcs r0, r1
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp ne i32 %a, %b
%cond = zext i1 %cmp to i32
ret i32 %cond
-; CHECK-LABEL: test1a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: subs r1, r0, #1
-; CHECK-NEXT: sbcs r0, r1
}
define i32 @test1b(i32 %a, i32 %b) {
+; CHECK-LABEL: test1b:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r1, r0, r1
+; CHECK-NEXT: rsbs r0, r1, #0
+; CHECK-NEXT: adcs r0, r1
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp eq i32 %a, %b
%cond = zext i1 %cmp to i32
ret i32 %cond
-; CHECK-LABEL: test1b:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r1, r0, r1
-; CHECK-NEXT: rsbs r0, r1, #0
-; CHECK-NEXT: adcs r0, r1
}
define i32 @test2a(i32 %a, i32 %b) {
+; CHECK-LABEL: test2a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r1, r0, r1
+; CHECK-NEXT: rsbs r0, r1, #0
+; CHECK-NEXT: adcs r0, r1
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp eq i32 %a, %b
%cond = zext i1 %cmp to i32
ret i32 %cond
-; CHECK-LABEL: test2a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r1, r0, r1
-; CHECK-NEXT: rsbs r0, r1, #0
-; CHECK-NEXT: adcs r0, r1
}
define i32 @test2b(i32 %a, i32 %b) {
+; CHECK-LABEL: test2b:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: subs r1, r0, #1
+; CHECK-NEXT: sbcs r0, r1
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp ne i32 %a, %b
%cond = zext i1 %cmp to i32
ret i32 %cond
-; CHECK-LABEL: test2b:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: subs r1, r0, #1
-; CHECK-NEXT: sbcs r0, r1
}
define i32 @test3a(i32 %a, i32 %b) {
+; CHECK-LABEL: test3a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: beq .LBB4_2
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: movs r0, #4
+; CHECK-NEXT: .LBB4_2: @ %entry
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp eq i32 %a, %b
%cond = select i1 %cmp, i32 0, i32 4
ret i32 %cond
-; CHECK-LABEL: test3a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: subs r1, r0, #1
-; CHECK-NEXT: sbcs r0, r1
-; CHECK-NEXT: lsls r0, r0, #2
}
define i32 @test3b(i32 %a, i32 %b) {
+; CHECK-LABEL: test3b:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: beq .LBB5_2
+; CHECK-NEXT: @ %bb.1: @ %entry
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: movs r0, #4
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp eq i32 %a, %b
%cond = select i1 %cmp, i32 4, i32 0
ret i32 %cond
-; CHECK-LABEL: test3b:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: rsbs r1, r0, #0
-; CHECK-NEXT: adcs r1, r0
-; CHECK-NEXT: lsls r0, r1, #2
}
define i32 @test4a(i32 %a, i32 %b) {
+; CHECK-LABEL: test4a:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: bne .LBB6_2
+; CHECK-NEXT: @ %bb.1: @ %entry
+; CHECK-NEXT: movs r0, #4
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp ne i32 %a, %b
%cond = select i1 %cmp, i32 0, i32 4
ret i32 %cond
-; CHECK-LABEL: test4a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: rsbs r1, r0, #0
-; CHECK-NEXT: adcs r1, r0
-; CHECK-NEXT: lsls r0, r1, #2
}
define i32 @test4b(i32 %a, i32 %b) {
+; CHECK-LABEL: test4b:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: subs r1, r0, #1
+; CHECK-NEXT: sbcs r0, r1
+; CHECK-NEXT: lsls r0, r0, #2
+; CHECK-NEXT: bx lr
entry:
%cmp = icmp ne i32 %a, %b
%cond = select i1 %cmp, i32 4, i32 0
ret i32 %cond
-; CHECK-LABEL: test4b:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK: subs r0, r0, r1
-; CHECK-NEXT: subs r1, r0, #1
-; CHECK-NEXT: sbcs r0, r1
-; CHECK-NEXT: lsls r0, r0, #2
}
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll
index febab82..57d1f37 100644
--- a/llvm/test/CodeGen/WebAssembly/exception.ll
+++ b/llvm/test/CodeGen/WebAssembly/exception.ll
@@ -566,6 +566,32 @@ unreachable: ; preds = %entry
unreachable
}
+; This tests whether llvm.wasm.throw intrinsic can invoked and iseled correctly.
+
+; CHECK-LABEL: invoke_throw:
+; CHECK: try_table (catch __cpp_exception 0)
+; CHECK: local.get 0
+; CHECK: throw __cpp_exception
+; CHECK: end_try_table
+define void @invoke_throw(ptr %p) personality ptr @__gxx_wasm_personality_v0 {
+entry:
+ invoke void @llvm.wasm.throw(i32 0, ptr %p)
+ to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [ptr null]
+ %2 = call ptr @llvm.wasm.get.exception(token %1)
+ %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+ %4 = call ptr @__cxa_begin_catch(ptr %2) #4 [ "funclet"(token %1) ]
+ call void @__cxa_end_catch() [ "funclet"(token %1) ]
+ catchret from %1 to label %try.cont
+
+try.cont: ; preds = %catch.start, %entry
+ ret void
+}
declare void @foo()
declare void @bar(ptr)
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index 5f0ba4a..4e8ff59 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -172,6 +172,16 @@ define <8 x i1> @compare_oge_v8f16 (<8 x half> %x, <8 x half> %y) {
ret <8 x i1> %res
}
+; CHECK-LABEL: compare_ule_v8f16:
+; CHECK-NEXT: .functype compare_ule_v8f16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f16x8.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i1> @compare_ule_v8f16 (<8 x half> %x, <8 x half> %y) {
+ %res = fcmp ule <8 x half> %x, %y
+ ret <8 x i1> %res
+}
+
; CHECK-LABEL: abs_v8f16:
; CHECK-NEXT: .functype abs_v8f16 (v128) -> (v128)
; CHECK-NEXT: f16x8.abs $push0=, $0
@@ -335,3 +345,27 @@ define void @store_v8f16(<8 x half> %v, ptr %p) {
store <8 x half> %v , ptr %p
ret void
}
+
+; ==============================================================================
+; Shuffle
+; ==============================================================================
+define <8 x half> @shuffle_v8f16(<8 x half> %x, <8 x half> %y) {
+; CHECK-LABEL: shuffle_v8f16:
+; CHECK: .functype shuffle_v8f16 (v128, v128) -> (v128)
+; CHECK-NEXT: i8x16.shuffle $push0=, $0, $1, 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK-NEXT: return $pop0
+ %res = shufflevector <8 x half> %x, <8 x half> %y,
+ <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x half> %res
+}
+
+define <8 x half> @shuffle_poison_v8f16(<8 x half> %x, <8 x half> %y) {
+; CHECK-LABEL: shuffle_poison_v8f16:
+; CHECK: .functype shuffle_poison_v8f16 (v128, v128) -> (v128)
+; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK-NEXT: return $pop0
+ %res = shufflevector <8 x half> %x, <8 x half> %y,
+ <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison,
+ i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <8 x half> %res
+}
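
The byte indices in the i8x16.shuffle checks above follow mechanically from the f16 masks: the mask indexes the 16 f16 lanes of the concatenated pair <%x, %y>, and f16 lane m covers bytes 2m and 2m+1 of that 32-byte concatenation. A sketch of the arithmetic for shuffle_v8f16's mask <0, 9, 2, 11, 4, 13, 6, 15>, not additional checks:

; m = 0 -> bytes 0,1      m = 9  -> bytes 18,19
; m = 2 -> bytes 4,5      m = 11 -> bytes 22,23
; m = 4 -> bytes 8,9      m = 13 -> bytes 26,27
; m = 6 -> bytes 12,13    m = 15 -> bytes 30,31

Interleaving these gives exactly the expected operand list 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31. In shuffle_poison_v8f16 only lane 1 of %x is live, so bytes 2,3 are fixed and the poison lanes are free to reuse any bytes (here 0,1).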
diff --git a/llvm/test/CodeGen/X86/andnot-blsmsk.ll b/llvm/test/CodeGen/X86/andnot-blsmsk.ll
new file mode 100644
index 0000000..7476682
--- /dev/null
+++ b/llvm/test/CodeGen/X86/andnot-blsmsk.ll
@@ -0,0 +1,327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi,+sse2 | FileCheck %s --check-prefixes=X86,X86-NOBMI
+; RUN: llc < %s -mtriple=i686-- -mattr=+bmi,+sse2 | FileCheck %s --check-prefixes=X86,X86-BMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+
+declare void @use(i32)
+
+define i32 @fold_and_xor_neg_v1_32(i32 %x, i32 %y) nounwind {
+; X86-NOBMI-LABEL: fold_and_xor_neg_v1_32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: negl %eax
+; X86-NOBMI-NEXT: xorl %ecx, %eax
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: fold_and_xor_neg_v1_32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: fold_and_xor_neg_v1_32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: negl %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: fold_and_xor_neg_v1_32:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: blsmskl %edi, %eax
+; X64-BMI-NEXT: andnl %esi, %eax, %eax
+; X64-BMI-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %x, %neg
+ %and = and i32 %xor, %y
+ ret i32 %and
+}
+
+define i32 @fold_and_xor_neg_v2_32(i32 %x, i32 %y) nounwind {
+; X86-NOBMI-LABEL: fold_and_xor_neg_v2_32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: negl %eax
+; X86-NOBMI-NEXT: xorl %ecx, %eax
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: fold_and_xor_neg_v2_32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: fold_and_xor_neg_v2_32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: negl %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: fold_and_xor_neg_v2_32:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: blsmskl %edi, %eax
+; X64-BMI-NEXT: andnl %esi, %eax, %eax
+; X64-BMI-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %x, %neg
+ %and = and i32 %y, %xor
+ ret i32 %and
+}
+
+define i32 @fold_and_xor_neg_v3_32(i32 %x, i32 %y) nounwind {
+; X86-NOBMI-LABEL: fold_and_xor_neg_v3_32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: negl %eax
+; X86-NOBMI-NEXT: xorl %ecx, %eax
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: fold_and_xor_neg_v3_32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: fold_and_xor_neg_v3_32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: negl %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: fold_and_xor_neg_v3_32:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: blsmskl %edi, %eax
+; X64-BMI-NEXT: andnl %esi, %eax, %eax
+; X64-BMI-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %neg, %x
+ %and = and i32 %xor, %y
+ ret i32 %and
+}
+
+define i32 @fold_and_xor_neg_v4_32(i32 %x, i32 %y) nounwind {
+; X86-NOBMI-LABEL: fold_and_xor_neg_v4_32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: negl %eax
+; X86-NOBMI-NEXT: xorl %ecx, %eax
+; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: fold_and_xor_neg_v4_32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: fold_and_xor_neg_v4_32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: negl %eax
+; X64-NOBMI-NEXT: xorl %edi, %eax
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: fold_and_xor_neg_v4_32:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: blsmskl %edi, %eax
+; X64-BMI-NEXT: andnl %esi, %eax, %eax
+; X64-BMI-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %neg, %x
+ %and = and i32 %y, %xor
+ ret i32 %and
+}
+
+define i64 @fold_and_xor_neg_v1_64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-NOBMI-LABEL: fold_and_xor_neg_v1_64:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movq %rdi, %rax
+; X64-NOBMI-NEXT: negq %rax
+; X64-NOBMI-NEXT: xorq %rdi, %rax
+; X64-NOBMI-NEXT: andq %rsi, %rax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: fold_and_xor_neg_v1_64:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: blsmskq %rdi, %rax
+; X64-BMI-NEXT: andnq %rsi, %rax, %rax
+; X64-BMI-NEXT: retq
+ %neg = sub i64 0, %x
+ %xor = xor i64 %x, %neg
+ %and = and i64 %xor, %y
+ ret i64 %and
+}
+
+; Negative test
+define i16 @fold_and_xor_neg_v1_16_negative(i16 %x, i16 %y) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_16_negative:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: fold_and_xor_neg_v1_16_negative:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %neg = sub i16 0, %x
+ %xor = xor i16 %x, %neg
+ %and = and i16 %xor, %y
+ ret i16 %and
+}
+
+; Negative test
+define <4 x i32> @fold_and_xor_neg_v1_v4x32_negative(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_v4x32_negative:
+; X86: # %bb.0:
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: psubd %xmm0, %xmm2
+; X86-NEXT: pxor %xmm2, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: fold_and_xor_neg_v1_v4x32_negative:
+; X64: # %bb.0:
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: psubd %xmm0, %xmm2
+; X64-NEXT: pxor %xmm2, %xmm0
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %neg = sub <4 x i32> zeroinitializer, %x
+ %xor = xor <4 x i32> %x, %neg
+ %and = and <4 x i32> %xor, %y
+ ret <4 x i32> %and
+}
+
+; Negative test
+define i32 @fold_and_xor_neg_v1_32_two_uses_xor_negative(i32 %x, i32 %y) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_32_two_uses_xor_negative:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl %ecx, %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: fold_and_xor_neg_v1_32_two_uses_xor_negative:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movl %esi, %ebx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: xorl %eax, %edi
+; X64-NEXT: andl %edi, %ebx
+; X64-NEXT: callq use@PLT
+; X64-NEXT: movl %ebx, %eax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %x, %neg
+ %and = and i32 %xor, %y
+ call void @use(i32 %xor)
+ ret i32 %and
+}
+
+; Negative test
+define i32 @fold_and_xor_neg_v1_32_two_uses_sub_negative(i32 %x, i32 %y) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_32_two_uses_sub_negative:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: fold_and_xor_neg_v1_32_two_uses_sub_negative:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movl %edi, %ebx
+; X64-NEXT: negl %edi
+; X64-NEXT: xorl %edi, %ebx
+; X64-NEXT: andl %esi, %ebx
+; X64-NEXT: callq use@PLT
+; X64-NEXT: movl %ebx, %eax
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+ %neg = sub i32 0, %x
+ %xor = xor i32 %x, %neg
+ %and = and i32 %xor, %y
+ call void @use(i32 %neg)
+ ret i32 %and
+}
+
+; Negative test
+define i32 @fold_and_xor_neg_v1_32_no_blsmsk_negative(i32 %x, i32 %y, i32 %z) nounwind {
+; X86-LABEL: fold_and_xor_neg_v1_32_no_blsmsk_negative:
+; X86: # %bb.0:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: fold_and_xor_neg_v1_32_no_blsmsk_negative:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %neg = sub i32 0, %z
+ %xor = xor i32 %x, %neg
+ %and = and i32 %xor, %y
+ ret i32 %and
+}
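
The folds this new file exercises rest on a bit identity: x ^ -x equals ~(x ^ (x - 1)), i.e. ~blsmsk(x), a mask of the bits strictly above x's lowest set bit. So (x ^ -x) & y is andn(y, blsmsk(x)), one BLSMSK plus one ANDN when BMI is available. A minimal IR sketch of the canonical pattern (an illustration of the first test; the checks above are authoritative for the lowering):

; Worked example, x = 12: x - 1 = 0b00001011, blsmsk(x) = x ^ (x - 1) = 0b00000111,
; while -x = 0b11110100 and x ^ -x = 0b11111000 = ~blsmsk(x).
define i32 @blsmsk_andn_sketch(i32 %x, i32 %y) {
  %neg = sub i32 0, %x     ; -x
  %xor = xor i32 %x, %neg  ; x ^ -x == ~blsmsk(x)
  %and = and i32 %xor, %y  ; keeps the bits of %y strictly above x's lowest set bit
  ret i32 %and             ; with +bmi: blsmskl + andnl
}

The negative tests then carve out the boundaries: i16 and vector types have no BLSMSK encoding, an extra use of the xor or of the sub makes the two-instruction form unprofitable, and the fold requires the negated value and the xor operand to be the same variable.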
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index e513b66..67c9e7c 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1688,3 +1688,31 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
%mul = fdiv float 0x3a20000000000000, %conv
ret float %mul
}
+
+define x86_fp80 @pr128528(i1 %cond) {
+; CHECK-SSE-LABEL: pr128528:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: testb $1, %dil
+; CHECK-SSE-NEXT: movl $8, %eax
+; CHECK-SSE-NEXT: movl $1, %ecx
+; CHECK-SSE-NEXT: cmovnel %eax, %ecx
+; CHECK-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-SSE-NEXT: fildl -{{[0-9]+}}(%rsp)
+; CHECK-SSE-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX-LABEL: pr128528:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: testb $1, %dil
+; CHECK-AVX-NEXT: movl $8, %eax
+; CHECK-AVX-NEXT: movl $1, %ecx
+; CHECK-AVX-NEXT: cmovnel %eax, %ecx
+; CHECK-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT: fildl -{{[0-9]+}}(%rsp)
+; CHECK-AVX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
+; CHECK-AVX-NEXT: retq
+ %sub9 = select i1 %cond, i32 8, i32 1
+ %conv = uitofp i32 %sub9 to x86_fp80
+ %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
+ ret x86_fp80 %mul
+}
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index bb75ec1..0831675 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -921,3 +921,271 @@ entry:
ret fp128 %call
}
declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define fp128 @Test128Acos(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Acos:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp acosl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Acos:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp acosf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Acos:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll acosl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.acos.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Asin(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Asin:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp asinl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Asin:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp asinf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Asin:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll asinl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.asin.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Atan(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Atan:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp atanl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Atan:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp atanf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Atan:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll atanl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.atan.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
+; ANDROID-LABEL: Test128Atan2:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp atan2l@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Atan2:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp atan2f128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Atan2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll atan2l
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
+ ret fp128 %x
+}
+
+define fp128 @Test128Cosh(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Cosh:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp coshl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Cosh:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp coshf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Cosh:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll coshl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.cosh.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Sinh(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Sinh:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp sinhl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Sinh:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp sinhf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Sinh:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll sinhl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.sinh.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Tan(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Tan:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp tanl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Tan:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp tanf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Tan:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll tanl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.tan.f128(fp128 %a)
+ ret fp128 %x
+}
+
+define fp128 @Test128Tanh(fp128 %a) nounwind {
+; ANDROID-LABEL: Test128Tanh:
+; ANDROID: # %bb.0:
+; ANDROID-NEXT: jmp tanhl@PLT # TAILCALL
+;
+; GNU-LABEL: Test128Tanh:
+; GNU: # %bb.0:
+; GNU-NEXT: jmp tanhf128@PLT # TAILCALL
+;
+; X86-LABEL: Test128Tanh:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll tanhl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl $4
+ %x = call fp128 @llvm.tanh.f128(fp128 %a)
+ ret fp128 %x
+}
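
The GNU/ANDROID split in the new fp128 tests above comes down to libcall naming: GNU environments route fp128 math to the dedicated *f128 entry points, while Android keeps the long-double *l names, which carry the right ABI there because long double is IEEE binary128 on x86-64 Android. Each test instantiates the same shape; a minimal sketch, with names as in the checks above:

define fp128 @sketch_fp128_acos(fp128 %a) nounwind {
  ; lowers to acosf128@PLT on GNU, acosl@PLT on Android
  %r = call fp128 @llvm.acos.f128(fp128 %a)
  ret fp128 %r
}
declare fp128 @llvm.acos.f128(fp128)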
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 3af8b1a..57db963 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -1188,3 +1188,472 @@ define void @test_half_trunc(half %a0, ptr %p0) nounwind {
store half %res, ptr %p0, align 2
ret void
}
+
+define half @test_half_acos(half %a) nounwind {
+; F16C-LABEL: test_half_acos:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq acosf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_acos:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq acosf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_acos:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq acosf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_acos:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll acosf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.acos.f16(half %a)
+ ret half %x
+}
+
+define half @test_half_asin(half %a) nounwind {
+; F16C-LABEL: test_half_asin:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq asinf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_asin:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq asinf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_asin:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq asinf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_asin:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll asinf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.asin.f16(half %a)
+ ret half %x
+}
+
+define half @test_half_atan(half %a) nounwind {
+; F16C-LABEL: test_half_atan:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq atanf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_atan:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq atanf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_atan:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq atanf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_atan:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll atanf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.atan.f16(half %a)
+ ret half %x
+}
+
+define half @test_half_atan2(half %a, half %b) nounwind {
+; F16C-LABEL: test_half_atan2:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT: callq atan2f@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_atan2:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
+; FP16-NEXT: callq atan2f@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_atan2:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; X64-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: callq atan2f@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_atan2:
+; X86: # %bb.0:
+; X86-NEXT: subl $60, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; X86-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll atan2f
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $60, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.atan2.f16(half %a, half %b)
+ ret half %x
+}
+
+define half @test2_half_cos(half %Val) nounwind {
+; F16C-LABEL: test2_half_cos:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq cosf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test2_half_cos:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq cosf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test2_half_cos:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq cosf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test2_half_cos:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll cosf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %res = call half @llvm.cos.f16(half %Val)
+ ret half %res
+}
+
+define half @test_half_cosh(half %a) nounwind {
+; F16C-LABEL: test_half_cosh:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq coshf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_cosh:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq coshf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_cosh:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq coshf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_cosh:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll coshf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.cosh.f16(half %a)
+ ret half %x
+}
+
+define half @test2_half_sin(half %Val) nounwind {
+; F16C-LABEL: test2_half_sin:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq sinf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test2_half_sin:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq sinf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test2_half_sin:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq sinf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test2_half_sin:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sinf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %res = call half @llvm.sin.f16(half %Val)
+ ret half %res
+}
+
+define half @test_half_sinh(half %a) nounwind {
+; F16C-LABEL: test_half_sinh:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq sinhf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_sinh:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq sinhf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_sinh:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq sinhf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_sinh:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sinhf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.sinh.f16(half %a)
+ ret half %x
+}
+
+define half @test2_half_tan(half %a) nounwind {
+; F16C-LABEL: test2_half_tan:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq tanf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test2_half_tan:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq tanf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test2_half_tan:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq tanf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test2_half_tan:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll tanf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.tan.f16(half %a)
+ ret half %x
+}
+
+define half @test_half_tanh(half %a) nounwind {
+; F16C-LABEL: test_half_tanh:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq tanhf@PLT
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_tanh:
+; FP16: # %bb.0:
+; FP16-NEXT: pushq %rax
+; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; FP16-NEXT: callq tanhf@PLT
+; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; FP16-NEXT: popq %rax
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_tanh:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: callq tanhf@PLT
+; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_tanh:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll tanhf
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+ %x = call half @llvm.tanh.f16(half %a)
+ ret half %x
+}
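
Every half test above follows the same promotion protocol: there is no f16 libm, so the operand is extended to float, the single-precision libcall runs, and the result is truncated back to half. The extend/truncate pair shows up as __extendhfsf2/__truncsfhf2 calls on plain x86, or as vcvtph2ps/vcvtps2ph (vcvtsh2ss/vcvtss2sh with AVX512-FP16) when hardware conversions exist. Roughly the expansion, as an IR sketch rather than the literal legalizer output:

define half @sketch_half_acos(half %a) nounwind {
  %f = fpext half %a to float       ; __extendhfsf2, or a vcvt* conversion
  %r = call float @acosf(float %f)  ; promoted single-precision libcall
  %h = fptrunc float %r to half     ; __truncsfhf2, or a vcvt* conversion
  ret half %h
}
declare float @acosf(float)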
diff --git a/llvm/test/CodeGen/X86/llvm.acos.ll b/llvm/test/CodeGen/X86/llvm.acos.ll
index 202fde8..9ae6749 100644
--- a/llvm/test/CodeGen/X86/llvm.acos.ll
+++ b/llvm/test/CodeGen/X86/llvm.acos.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_acosf16(half %a) nounwind {
-; CHECK-LABEL: use_acosf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq acosf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.acos.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_acosf32(float %a) nounwind {
-; CHECK-LABEL: use_acosf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp acosf@PLT # TAILCALL
+; X86-LABEL: use_acosf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll acosf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_acosf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp acosf@PLT # TAILCALL
%x = call float @llvm.acos.f32(float %a)
ret float %x
}
define double @use_acosf64(double %a) nounwind {
-; CHECK-LABEL: use_acosf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp acos@PLT # TAILCALL
+; X86-LABEL: use_acosf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll acos
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_acosf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp acos@PLT # TAILCALL
%x = call double @llvm.acos.f64(double %a)
ret double %x
}
define x86_fp80 @use_acosf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_acosf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq acosl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_acosf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll acosl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_acosf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq acosl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.acos.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_acosfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_acosfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp acosf128@PLT # TAILCALL
- %x = call fp128 @llvm.acos.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_acosppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_acosppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq acosl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.acos.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.acos.f16(half)
declare float @llvm.acos.f32(float)
declare double @llvm.acos.f64(double)
declare x86_fp80 @llvm.acos.f80(x86_fp80)
-declare fp128 @llvm.acos.f128(fp128)
-declare ppc_fp128 @llvm.acos.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
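
The regenerated RUN lines here (and in the sibling llvm.asin/atan/atan2/cosh/sinh files below) add i686 coverage and a GlobalISel configuration, while the half, fp128, and ppc_fp128 variants removed from these files are the ones consolidated into fp16-libcalls.ll and fp128-libcalls.ll above. -global-isel-abort=2 tells llc to fall back to SelectionDAG with a remark instead of erroring out when GlobalISel cannot select; for these libcall-only intrinsics the fallback output is identical per architecture, so every check lands under the shared X86/X64 prefixes and the four configuration-specific prefixes end up in the autogenerated unused-prefix list. A standalone reproduction of one configuration, as in the RUN lines above:

llc < llvm.acos.ll -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2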
diff --git a/llvm/test/CodeGen/X86/llvm.asin.ll b/llvm/test/CodeGen/X86/llvm.asin.ll
index 1e047d0..f17e33a 100644
--- a/llvm/test/CodeGen/X86/llvm.asin.ll
+++ b/llvm/test/CodeGen/X86/llvm.asin.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_asinf16(half %a) nounwind {
-; CHECK-LABEL: use_asinf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq asinf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.asin.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_asinf32(float %a) nounwind {
-; CHECK-LABEL: use_asinf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp asinf@PLT # TAILCALL
+; X86-LABEL: use_asinf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll asinf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_asinf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp asinf@PLT # TAILCALL
%x = call float @llvm.asin.f32(float %a)
ret float %x
}
define double @use_asinf64(double %a) nounwind {
-; CHECK-LABEL: use_asinf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp asin@PLT # TAILCALL
+; X86-LABEL: use_asinf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll asin
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_asinf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp asin@PLT # TAILCALL
%x = call double @llvm.asin.f64(double %a)
ret double %x
}
define x86_fp80 @use_asinf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_asinf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq asinl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_asinf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll asinl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_asinf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq asinl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.asin.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_asinfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_asinfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp asinf128@PLT # TAILCALL
- %x = call fp128 @llvm.asin.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_asinppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_asinppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq asinl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.asin.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.asin.f16(half)
declare float @llvm.asin.f32(float)
declare double @llvm.asin.f64(double)
declare x86_fp80 @llvm.asin.f80(x86_fp80)
-declare fp128 @llvm.asin.f128(fp128)
-declare ppc_fp128 @llvm.asin.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.atan.ll b/llvm/test/CodeGen/X86/llvm.atan.ll
index d33ef7f..013b5e7 100644
--- a/llvm/test/CodeGen/X86/llvm.atan.ll
+++ b/llvm/test/CodeGen/X86/llvm.atan.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_atanf16(half %a) nounwind {
-; CHECK-LABEL: use_atanf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq atanf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.atan.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_atanf32(float %a) nounwind {
-; CHECK-LABEL: use_atanf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atanf@PLT # TAILCALL
+; X86-LABEL: use_atanf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll atanf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atanf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp atanf@PLT # TAILCALL
%x = call float @llvm.atan.f32(float %a)
ret float %x
}
define double @use_atanf64(double %a) nounwind {
-; CHECK-LABEL: use_atanf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atan@PLT # TAILCALL
+; X86-LABEL: use_atanf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll atan
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atanf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp atan@PLT # TAILCALL
%x = call double @llvm.atan.f64(double %a)
ret double %x
}
define x86_fp80 @use_atanf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_atanf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq atanl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_atanf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll atanl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atanf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq atanl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.atan.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_atanfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_atanfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atanf128@PLT # TAILCALL
- %x = call fp128 @llvm.atan.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_atanppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_atanppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq atanl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.atan.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.atan.f16(half)
declare float @llvm.atan.f32(float)
declare double @llvm.atan.f64(double)
declare x86_fp80 @llvm.atan.f80(x86_fp80)
-declare fp128 @llvm.atan.f128(fp128)
-declare ppc_fp128 @llvm.atan.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.atan2.ll b/llvm/test/CodeGen/X86/llvm.atan2.ll
index ef2e4be..b72cb10 100644
--- a/llvm/test/CodeGen/X86/llvm.atan2.ll
+++ b/llvm/test/CodeGen/X86/llvm.atan2.ll
@@ -1,80 +1,78 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_atan2f16(half %a, half %b) nounwind {
-; CHECK-LABEL: use_atan2f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: callq atan2f@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.atan2.f16(half %a, half %b)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_atan2f32(float %a, float %b) nounwind {
-; CHECK-LABEL: use_atan2f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atan2f@PLT # TAILCALL
+; X86-LABEL: use_atan2f32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll atan2f
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atan2f32:
+; X64: # %bb.0:
+; X64-NEXT: jmp atan2f@PLT # TAILCALL
%x = call float @llvm.atan2.f32(float %a, float %b)
ret float %x
}
define double @use_atan2f64(double %a, double %b) nounwind {
-; CHECK-LABEL: use_atan2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atan2@PLT # TAILCALL
+; X86-LABEL: use_atan2f64:
+; X86: # %bb.0:
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll atan2
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atan2f64:
+; X64: # %bb.0:
+; X64-NEXT: jmp atan2@PLT # TAILCALL
%x = call double @llvm.atan2.f64(double %a, double %b)
ret double %x
}
define x86_fp80 @use_atan2f80(x86_fp80 %a, x86_fp80 %b) nounwind {
-; CHECK-LABEL: use_atan2f80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $40, %rsp
-; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
-; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
-; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq atan2l@PLT
-; CHECK-NEXT: addq $40, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_atan2f80:
+; X86: # %bb.0:
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll atan2l
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_atan2f80:
+; X64: # %bb.0:
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq atan2l@PLT
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.atan2.f80(x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %x
}
-define fp128 @use_atan2fp128(fp128 %a, fp128 %b) nounwind {
-; CHECK-LABEL: use_atan2fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp atan2f128@PLT # TAILCALL
- %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_atan2ppc_fp128(ppc_fp128 %a, ppc_fp128 %b) nounwind {
-; CHECK-LABEL: use_atan2ppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq atan2l@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %a, ppc_fp128 %b)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.atan2.f16(half, half)
declare float @llvm.atan2.f32(float, float)
declare double @llvm.atan2.f64(double, double)
declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80)
-declare fp128 @llvm.atan2.f128(fp128, fp128)
-declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128, ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.cos.ll b/llvm/test/CodeGen/X86/llvm.cos.ll
new file mode 100644
index 0000000..35c22b8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.cos.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64
+
+define float @test_cos_f32(float %Val) nounwind {
+; X86-LABEL: test_cos_f32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll cosf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_cos_f32:
+; X64: # %bb.0:
+; X64-NEXT: jmp cosf@PLT # TAILCALL
+;
+; GISEL-X64-LABEL: test_cos_f32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: jmp cosf@PLT # TAILCALL
+ %res = call float @llvm.cos.f32(float %Val)
+ ret float %res
+}
+
+define double @test_cos_f64(double %Val) nounwind {
+; X86-LABEL: test_cos_f64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll cos
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_cos_f64:
+; X64: # %bb.0:
+; X64-NEXT: jmp cos@PLT # TAILCALL
+;
+; GISEL-X64-LABEL: test_cos_f64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: jmp cos@PLT # TAILCALL
+ %res = call double @llvm.cos.f64(double %Val)
+ ret double %res
+}
+
+define x86_fp80 @test_cos_f80(x86_fp80 %Val) nounwind {
+; X86-LABEL: test_cos_f80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll cosl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_cos_f80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq cosl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: test_cos_f80:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: subq $24, %rsp
+; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; GISEL-X64-NEXT: fstpt (%rsp)
+; GISEL-X64-NEXT: callq cosl@PLT
+; GISEL-X64-NEXT: addq $24, %rsp
+; GISEL-X64-NEXT: retq
+ %res = call x86_fp80 @llvm.cos.f80(x86_fp80 %Val)
+ ret x86_fp80 %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.cosh.ll b/llvm/test/CodeGen/X86/llvm.cosh.ll
index 5e7582c..5f0c2f1 100644
--- a/llvm/test/CodeGen/X86/llvm.cosh.ll
+++ b/llvm/test/CodeGen/X86/llvm.cosh.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_coshf16(half %a) nounwind {
-; CHECK-LABEL: use_coshf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq coshf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.cosh.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_coshf32(float %a) nounwind {
-; CHECK-LABEL: use_coshf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp coshf@PLT # TAILCALL
+; X86-LABEL: use_coshf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll coshf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_coshf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp coshf@PLT # TAILCALL
%x = call float @llvm.cosh.f32(float %a)
ret float %x
}
define double @use_coshf64(double %a) nounwind {
-; CHECK-LABEL: use_coshf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp cosh@PLT # TAILCALL
+; X86-LABEL: use_coshf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll cosh
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_coshf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp cosh@PLT # TAILCALL
%x = call double @llvm.cosh.f64(double %a)
ret double %x
}
define x86_fp80 @use_coshf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_coshf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq coshl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_coshf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll coshl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_coshf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq coshl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.cosh.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_coshfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_coshfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp coshf128@PLT # TAILCALL
- %x = call fp128 @llvm.cosh.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_coshppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_coshppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq coshl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.cosh.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.cosh.f16(half)
declare float @llvm.cosh.f32(float)
declare double @llvm.cosh.f64(double)
declare x86_fp80 @llvm.cosh.f80(x86_fp80)
-declare fp128 @llvm.cosh.f128(fp128)
-declare ppc_fp128 @llvm.cosh.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.sin.ll b/llvm/test/CodeGen/X86/llvm.sin.ll
new file mode 100644
index 0000000..1d2dcef
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.sin.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64
+
+define float @test_sin_f32(float %Val) nounwind {
+; X86-LABEL: test_sin_f32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sinf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sin_f32:
+; X64: # %bb.0:
+; X64-NEXT: jmp sinf@PLT # TAILCALL
+;
+; GISEL-X64-LABEL: test_sin_f32:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: jmp sinf@PLT # TAILCALL
+ %res = call float @llvm.sin.f32(float %Val)
+ ret float %res
+}
+
+define double @test_sin_f64(double %Val) nounwind {
+; X86-LABEL: test_sin_f64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sin
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sin_f64:
+; X64: # %bb.0:
+; X64-NEXT: jmp sin@PLT # TAILCALL
+;
+; GISEL-X64-LABEL: test_sin_f64:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: jmp sin@PLT # TAILCALL
+ %res = call double @llvm.sin.f64(double %Val)
+ ret double %res
+}
+
+define x86_fp80 @test_sin_f80(x86_fp80 %Val) nounwind {
+; X86-LABEL: test_sin_f80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll sinl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sin_f80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq sinl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+;
+; GISEL-X64-LABEL: test_sin_f80:
+; GISEL-X64: # %bb.0:
+; GISEL-X64-NEXT: subq $24, %rsp
+; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; GISEL-X64-NEXT: fstpt (%rsp)
+; GISEL-X64-NEXT: callq sinl@PLT
+; GISEL-X64-NEXT: addq $24, %rsp
+; GISEL-X64-NEXT: retq
+ %res = call x86_fp80 @llvm.sin.f80(x86_fp80 %Val)
+ ret x86_fp80 %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.sinh.ll b/llvm/test/CodeGen/X86/llvm.sinh.ll
index ba22842..de8a710 100644
--- a/llvm/test/CodeGen/X86/llvm.sinh.ll
+++ b/llvm/test/CodeGen/X86/llvm.sinh.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_sinhf16(half %a) nounwind {
-; CHECK-LABEL: use_sinhf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq sinhf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.sinh.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_sinhf32(float %a) nounwind {
-; CHECK-LABEL: use_sinhf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp sinhf@PLT # TAILCALL
+; X86-LABEL: use_sinhf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sinhf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_sinhf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp sinhf@PLT # TAILCALL
%x = call float @llvm.sinh.f32(float %a)
ret float %x
}
define double @use_sinhf64(double %a) nounwind {
-; CHECK-LABEL: use_sinhf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp sinh@PLT # TAILCALL
+; X86-LABEL: use_sinhf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sinh
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_sinhf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp sinh@PLT # TAILCALL
%x = call double @llvm.sinh.f64(double %a)
ret double %x
}
define x86_fp80 @use_sinhf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_sinhf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq sinhl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_sinhf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll sinhl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_sinhf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq sinhl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.sinh.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_sinhfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_sinhfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp sinhf128@PLT # TAILCALL
- %x = call fp128 @llvm.sinh.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_sinhppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_sinhppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq sinhl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.sinh.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.sinh.f16(half)
declare float @llvm.sinh.f32(float)
declare double @llvm.sinh.f64(double)
declare x86_fp80 @llvm.sinh.f80(x86_fp80)
-declare fp128 @llvm.sinh.f128(fp128)
-declare ppc_fp128 @llvm.sinh.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.tan.ll b/llvm/test/CodeGen/X86/llvm.tan.ll
index 24b3003..3205551 100644
--- a/llvm/test/CodeGen/X86/llvm.tan.ll
+++ b/llvm/test/CodeGen/X86/llvm.tan.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_tanf16(half %a) nounwind {
-; CHECK-LABEL: use_tanf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq tanf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.tan.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_tanf32(float %a) nounwind {
-; CHECK-LABEL: use_tanf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tanf@PLT # TAILCALL
+; X86-LABEL: use_tanf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll tanf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp tanf@PLT # TAILCALL
%x = call float @llvm.tan.f32(float %a)
ret float %x
}
define double @use_tanf64(double %a) nounwind {
-; CHECK-LABEL: use_tanf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tan@PLT # TAILCALL
+; X86-LABEL: use_tanf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll tan
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp tan@PLT # TAILCALL
%x = call double @llvm.tan.f64(double %a)
ret double %x
}
define x86_fp80 @use_tanf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_tanf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq tanl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_tanf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll tanl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq tanl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.tan.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_tanfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_tanfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tanf128@PLT # TAILCALL
- %x = call fp128 @llvm.tan.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_tanppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_tanppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq tanl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.tan.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.tan.f16(half)
declare float @llvm.tan.f32(float)
declare double @llvm.tan.f64(double)
declare x86_fp80 @llvm.tan.f80(x86_fp80)
-declare fp128 @llvm.tan.f128(fp128)
-declare ppc_fp128 @llvm.tan.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.tanh.ll b/llvm/test/CodeGen/X86/llvm.tanh.ll
index 7119c40..6911a68 100644
--- a/llvm/test/CodeGen/X86/llvm.tanh.ll
+++ b/llvm/test/CodeGen/X86/llvm.tanh.ll
@@ -1,70 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
-
-define half @use_tanhf16(half %a) nounwind {
-; CHECK-LABEL: use_tanhf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: callq tanhf@PLT
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call half @llvm.tanh.f16(half %a)
- ret half %x
-}
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
define float @use_tanhf32(float %a) nounwind {
-; CHECK-LABEL: use_tanhf32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tanhf@PLT # TAILCALL
+; X86-LABEL: use_tanhf32:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll tanhf
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanhf32:
+; X64: # %bb.0:
+; X64-NEXT: jmp tanhf@PLT # TAILCALL
%x = call float @llvm.tanh.f32(float %a)
ret float %x
}
define double @use_tanhf64(double %a) nounwind {
-; CHECK-LABEL: use_tanhf64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tanh@PLT # TAILCALL
+; X86-LABEL: use_tanhf64:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll tanh
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanhf64:
+; X64: # %bb.0:
+; X64-NEXT: jmp tanh@PLT # TAILCALL
%x = call double @llvm.tanh.f64(double %a)
ret double %x
}
define x86_fp80 @use_tanhf80(x86_fp80 %a) nounwind {
-; CHECK-LABEL: use_tanhf80:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt 32(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq tanhl@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X86-LABEL: use_tanhf80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll tanhl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: use_tanhf80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq tanhl@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%x = call x86_fp80 @llvm.tanh.f80(x86_fp80 %a)
ret x86_fp80 %x
}
-define fp128 @use_tanhfp128(fp128 %a) nounwind {
-; CHECK-LABEL: use_tanhfp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: jmp tanhf128@PLT # TAILCALL
- %x = call fp128 @llvm.tanh.f128(fp128 %a)
- ret fp128 %x
-}
-
-define ppc_fp128 @use_tanhppc_fp128(ppc_fp128 %a) nounwind {
-; CHECK-LABEL: use_tanhppc_fp128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq tanhl@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
- %x = call ppc_fp128 @llvm.tanh.ppcf128(ppc_fp128 %a)
- ret ppc_fp128 %x
-}
-
-declare half @llvm.tanh.f16(half)
declare float @llvm.tanh.f32(float)
declare double @llvm.tanh.f64(double)
declare x86_fp80 @llvm.tanh.f80(x86_fp80)
-declare fp128 @llvm.tanh.f128(fp128)
-declare ppc_fp128 @llvm.tanh.ppcf128(ppc_fp128)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-X64: {{.*}}
+; GISEL-X86: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X86: {{.*}}
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s
deleted file mode 100644
index acbd010..0000000
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-all.s
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
-
-// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
-// ASM: .aeabi_attribute Tag_PAuth_Platform, 1
-// ASM: .aeabi_attribute Tag_PAuth_Schema, 1
-// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 1
-// ASM: .aeabi_attribute Tag_Feature_PAC, 1
-// ASM: .aeabi_attribute Tag_Feature_GCS, 1
-
-// ELF: Hex dump of section '.ARM.attributes':
-// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth
-// ELF-NEXT: 0x00000010 61626900 00000101 02012300 00006165 abi.......#...ae
-// ELF-NEXT: 0x00000020 6162695f 66656174 7572655f 616e645f abi_feature_and_
-// ELF-NEXT: 0x00000030 62697473 00010000 01010102 01
-
-
-.aeabi_subsection aeabi_pauthabi, required, uleb128
-.aeabi_attribute Tag_PAuth_Platform, 1
-.aeabi_attribute Tag_PAuth_Schema, 1
-.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-.aeabi_attribute Tag_Feature_BTI, 1
-.aeabi_attribute Tag_Feature_PAC, 1
-.aeabi_attribute Tag_Feature_GCS, 1
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s b/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s
deleted file mode 100644
index 229033a..0000000
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections.s
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
-
-// ASM: .aeabi_subsection private_subsection_1, optional, uleb128
-// ASM: .aeabi_attribute 12, 257
-// ASM: .aeabi_subsection private_subsection_2, required, uleb128
-// ASM: .aeabi_attribute 76, 257
-// ASM: .aeabi_subsection private_subsection_3, optional, ntbs
-// ASM: .aeabi_attribute 34, hello_llvm
-// ASM: .aeabi_subsection private_subsection_4, required, ntbs
-// ASM: .aeabi_attribute 777, "hello_llvm"
-// ASM: .aeabi_subsection private_subsection_1, optional, uleb128
-// ASM: .aeabi_attribute 876, 257
-// ASM: .aeabi_subsection private_subsection_2, required, uleb128
-// ASM: .aeabi_attribute 876, 257
-// ASM: .aeabi_subsection private_subsection_3, optional, ntbs
-// ASM: .aeabi_attribute 876, "hello_llvm"
-// ASM: .aeabi_subsection private_subsection_4, required, ntbs
-// ASM: .aeabi_attribute 876, hello_llvm
-
-// ELF: Hex dump of section '.ARM.attributes':
-// ELF-NEXT: 0x00000000 41220000 00707269 76617465 5f737562 A"...private_sub
-// ELF-NEXT: 0x00000010 73656374 696f6e5f 31000100 0c8102ec section_1.......
-// ELF-NEXT: 0x00000020 06810222 00000070 72697661 74655f73 ..."...private_s
-// ELF-NEXT: 0x00000030 75627365 6374696f 6e5f3200 00004c81 ubsection_2...L.
-// ELF-NEXT: 0x00000040 02ec0681 02360000 00707269 76617465 .....6...private
-// ELF-NEXT: 0x00000050 5f737562 73656374 696f6e5f 33000101 _subsection_3...
-// ELF-NEXT: 0x00000060 2268656c 6c6f5f6c 6c766d00 ec062268 "hello_llvm..."h
-// ELF-NEXT: 0x00000070 656c6c6f 5f6c6c76 6d220037 00000070 ello_llvm".7...p
-// ELF-NEXT: 0x00000080 72697661 74655f73 75627365 6374696f rivate_subsectio
-// ELF-NEXT: 0x00000090 6e5f3400 00018906 2268656c 6c6f5f6c n_4....."hello_l
-// ELF-NEXT: 0x000000a0 6c766d22 00ec0668 656c6c6f 5f6c6c76 lvm"...hello_llv
-// ELF-NEXT: 0x000000b0 6d00 m.
-
-
-.aeabi_subsection private_subsection_1, optional, uleb128
-.aeabi_attribute 12, 257
-.aeabi_subsection private_subsection_2, required, uleb128
-.aeabi_attribute 76, 257
-.aeabi_subsection private_subsection_3, optional, ntbs
-.aeabi_attribute 34, hello_llvm
-.aeabi_subsection private_subsection_4, required, ntbs
-.aeabi_attribute 777, "hello_llvm"
-.aeabi_subsection private_subsection_1, optional, uleb128
-.aeabi_attribute 876, 257
-.aeabi_subsection private_subsection_2, required, uleb128
-.aeabi_attribute 876, 257
-.aeabi_subsection private_subsection_3, optional, ntbs
-.aeabi_attribute 876, "hello_llvm"
-.aeabi_subsection private_subsection_4, required, ntbs
-.aeabi_attribute 876, hello_llvm
diff --git a/llvm/test/MC/AArch64/build-attributes-asm-aeabi-aeabi-known.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-aeabi-known.s
new file mode 100644
index 0000000..ecd7581
--- /dev/null
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-aeabi-known.s
@@ -0,0 +1,43 @@
+// RUN: llvm-mc -triple=aarch64 %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
+
+// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
+// ASM: .aeabi_attribute 1, 7 // Tag_PAuth_Platform
+// ASM: .aeabi_attribute 2, 777 // Tag_PAuth_Schema
+// ASM: .aeabi_attribute 2, 777 // Tag_PAuth_Schema
+// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
+// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
+// ASM: .aeabi_attribute 1, 7 // Tag_PAuth_Platform
+// ASM: .aeabi_attribute 2, 777 // Tag_PAuth_Schema
+// ASM: .aeabi_attribute 2, 777 // Tag_PAuth_Schema
+// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
+
+// ELF: Hex dump of section '.ARM.attributes':
+// ELF-NEXT: 0x00000000 411a0000 00616561 62695f70 61757468 A....aeabi_pauth
+// ELF-NEXT: 0x00000010 61626900 00000107 02890623 00000061 abi........#...a
+// ELF-NEXT: 0x00000020 65616269 5f666561 74757265 5f616e64 eabi_feature_and
+// ELF-NEXT: 0x00000030 5f626974 73000100 00010101 0201 _bits.........
+
+
+.aeabi_subsection aeabi_pauthabi, required, uleb128
+.aeabi_attribute Tag_PAuth_Platform, 7
+.aeabi_attribute Tag_PAuth_Schema, 777
+.aeabi_attribute Tag_PAuth_Schema, 777
+.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+.aeabi_attribute Tag_Feature_BTI, 1
+.aeabi_attribute Tag_Feature_PAC, 1
+.aeabi_attribute Tag_Feature_GCS, 1
+.aeabi_subsection aeabi_pauthabi, required, uleb128
+.aeabi_attribute 1, 7 // Tag_PAuth_Platform
+.aeabi_attribute 2, 777 // Tag_PAuth_Schema
+.aeabi_attribute 2, 777 // Tag_PAuth_Schema
+.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+.aeabi_attribute 0, 1 // Tag_Feature_BTI
+.aeabi_attribute 1, 1 // Tag_Feature_PAC
+.aeabi_attribute 2, 1 // Tag_Feature_GCS
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-bti.s
index 3897fee..3e97acf5a 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-bti.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-bti.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple=aarch64 %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 1
-// ASM: .aeabi_attribute Tag_Feature_PAC, 0
-// ASM: .aeabi_attribute Tag_Feature_GCS, 0
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 0 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 0 // Tag_Feature_GCS
// ELF: Hex dump of section '.ARM.attributes':
// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-attrs.s
index ddf8feb..d509974 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-attrs.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-attrs.s
@@ -1,5 +1,6 @@
// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s
+// Test logic and type mismatches
.aeabi_attribute Tag_Feature_BTI, 1
// ERR: error: no active subsection, build attribute can not be added
// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1
@@ -9,10 +10,6 @@
// ERR: error: unknown AArch64 build attribute 'Tag_Feature_BTI' for subsection 'aeabi_pauthabi'
// ERR-NEXT: .aeabi_attribute Tag_Feature_BTI, 1
-.aeabi_attribute Tag_PAuth_Platform, 4
-// ERR: error: unknown AArch64 build attributes Value for Tag 'Tag_PAuth_Platform' options are 0|1
-// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, 4
-
.aeabi_attribute a, 1
// ERR: error: unknown AArch64 build attribute 'a' for subsection 'aeabi_pauthabi'
// ERR-NEXT: .aeabi_attribute a, 1
@@ -25,6 +22,8 @@
// ERR: error: active subsection type is ULEB128 (unsigned), found NTBS (string)
// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform, a
+
+// Test syntax errors
.aeabi_attribute Tag_PAuth_Platform,
// ERR: error: AArch64 build attributes value not found
// ERR-NEXT: .aeabi_attribute Tag_PAuth_Platform,
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-headers.s
index 9e6dca3..501958a 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-err-headers.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-err-headers.s
@@ -1,21 +1,6 @@
// RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=ERR %s
-.aeabi_subsection aeabi_pauthabi, optional, uleb128
-// ERR: error: aeabi_pauthabi must be marked as required
-// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, optional, uleb128
-
-.aeabi_subsection aeabi_pauthabi, required, ntbs
-// ERR: error: aeabi_pauthabi must be marked as ULEB128
-// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, ntbs
-
-.aeabi_subsection aeabi_feature_and_bits, required, uleb128
-// ERR: error: aeabi_feature_and_bits must be marked as optional
-// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, required, uleb128
-
-.aeabi_subsection aeabi_feature_and_bits, optional, ntbs
-// ERR: error: aeabi_feature_and_bits must be marked as ULEB128
-// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, ntbs
-
+// Test syntax errors
.aeabi_subsection 1, required, uleb128
// ERR: error: subsection name not found
// ERR-NEXT: .aeabi_subsection 1, required, uleb128
@@ -25,11 +10,7 @@
// ERR-NEXT: .aeabi_subsection , required, uleb128
.aeabi_subsection aeabi_pauthabi, a, uleb128
-// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a
-// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128
-
-.aeabi_subsection aeabi_pauthabi, a, uleb128
-// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: a
+// ERR: error: unknown AArch64 build attributes optionality, expected required|optional
// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, a, uleb128
.aeabi_subsection aeabi_pauthabi, 1, uleb128
@@ -41,7 +22,7 @@
// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, ,uleb128
.aeabi_subsection aeabi_pauthabi,uleb128
-// ERR: error: unknown AArch64 build attributes optionality, expected required|optional: uleb128
+// ERR: error: unknown AArch64 build attributes optionality, expected required|optional
// ERR-NEXT: .aeabi_subsection aeabi_pauthabi,uleb128
.aeabi_subsection aeabi_pauthabi uleb128
@@ -57,5 +38,27 @@
// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required,
.aeabi_subsection aeabi_pauthabi, required, a
-// ERR: error: unknown AArch64 build attributes type, expected uleb128|ntbs: a
+// ERR: error: unknown AArch64 build attributes type, expected uleb128|ntbs
// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, a
+
+// Test type mismatches
+.aeabi_subsection aeabi_pauthabi, optional, uleb128
+// ERR: error: aeabi_pauthabi must be marked as required
+// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, optional, uleb128
+
+.aeabi_subsection aeabi_pauthabi, required, ntbs
+// ERR: error: aeabi_pauthabi must be marked as ULEB128
+// ERR-NEXT: .aeabi_subsection aeabi_pauthabi, required, ntbs
+
+.aeabi_subsection aeabi_feature_and_bits, required, uleb128
+// ERR: error: aeabi_feature_and_bits must be marked as optional
+// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, required, uleb128
+
+.aeabi_subsection aeabi_feature_and_bits, optional, ntbs
+// ERR: error: aeabi_feature_and_bits must be marked as ULEB128
+// ERR-NEXT: .aeabi_subsection aeabi_feature_and_bits, optional, ntbs
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-gcs.s
index 5cb7e68..177d249 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-gcs.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-gcs.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
-// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
+// RUN: llvm-mc -triple=aarch64 %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 0
-// ASM: .aeabi_attribute Tag_Feature_PAC, 0
-// ASM: .aeabi_attribute Tag_Feature_GCS, 1
+// ASM: .aeabi_attribute 0, 0 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 0 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
// ELF: Hex dump of section '.ARM.attributes':
// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
diff --git a/llvm/test/MC/AArch64/build-attributes-asm-aeabi-mixed.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-mixed.s
new file mode 100644
index 0000000..96bb59b
--- /dev/null
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-mixed.s
@@ -0,0 +1,50 @@
+// RUN: llvm-mc -triple=aarch64 %s | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
+
+// ASM: .aeabi_subsection subsection_a, optional, uleb128
+// ASM: .aeabi_subsection aeabi_subsection, optional, ntbs
+// ASM: .aeabi_subsection subsection_b, required, uleb128
+// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
+// ASM: .aeabi_attribute 1, 7 // Tag_PAuth_Platform
+// ASM: .aeabi_attribute 2, 777 // Tag_PAuth_Schema
+// ASM: .aeabi_attribute 1, 9 // Tag_PAuth_Platform
+// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
+// ASM: .aeabi_subsection aeabi_subsection, optional, ntbs
+// ASM: .aeabi_attribute 5, "Value"
+// ASM: .aeabi_subsection subsection_b, required, uleb128
+// ASM: .aeabi_attribute 6, 536
+// ASM: .aeabi_subsection subsection_a, optional, uleb128
+// ASM: .aeabi_attribute 7, 11
+
+// ELF: Hex dump of section '.ARM.attributes':
+// ELF-NEXT: 0x00000000 41150000 00737562 73656374 696f6e5f A....subsection_
+// ELF-NEXT: 0x00000010 61000100 070b2000 00006165 6162695f a..... ...aeabi_
+// ELF-NEXT: 0x00000020 73756273 65637469 6f6e0001 01052256 subsection...."V
+// ELF-NEXT: 0x00000030 616c7565 22001600 00007375 62736563 alue".....subsec
+// ELF-NEXT: 0x00000040 74696f6e 5f620000 00069804 1a000000 tion_b..........
+// ELF-NEXT: 0x00000050 61656162 695f7061 75746861 62690000 aeabi_pauthabi..
+// ELF-NEXT: 0x00000060 00010902 89062300 00006165 6162695f ......#...aeabi_
+// ELF-NEXT: 0x00000070 66656174 7572655f 616e645f 62697473 feature_and_bits
+// ELF-NEXT: 0x00000080 00010000 01010102 01 .........
+
+
+.aeabi_subsection subsection_a, optional, uleb128
+.aeabi_subsection aeabi_subsection, optional, ntbs
+.aeabi_subsection subsection_b, required, uleb128
+.aeabi_subsection aeabi_pauthabi, required, uleb128
+.aeabi_attribute Tag_PAuth_Platform, 7
+.aeabi_attribute Tag_PAuth_Schema, 777
+.aeabi_attribute Tag_PAuth_Platform, 9
+.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+.aeabi_attribute Tag_Feature_BTI, 1
+.aeabi_attribute Tag_Feature_PAC, 1
+.aeabi_attribute Tag_Feature_GCS, 1
+.aeabi_subsection aeabi_subsection, optional, ntbs
+.aeabi_attribute 5, "Value"
+.aeabi_subsection subsection_b, required, uleb128
+.aeabi_attribute 6, 536
+.aeabi_subsection subsection_a, optional, uleb128
+.aeabi_attribute 7, 11
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-none.s
index a3cbbe2..0b60b8c 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-none.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-none.s
@@ -1,13 +1,13 @@
// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
-// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
-// ASM: .aeabi_attribute Tag_PAuth_Platform, 0
-// ASM: .aeabi_attribute Tag_PAuth_Schema, 0
-// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 0
-// ASM: .aeabi_attribute Tag_Feature_PAC, 0
-// ASM: .aeabi_attribute Tag_Feature_GCS, 0
+// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
+// ASM: .aeabi_attribute 1, 0 // Tag_PAuth_Platform
+// ASM: .aeabi_attribute 2, 0 // Tag_PAuth_Schema
+// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
+// ASM: .aeabi_attribute 0, 0 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 0 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 0 // Tag_Feature_GCS
// ELF: Hex dump of section '.ARM.attributes':
// ELF-NEXT: 0x00000000 41190000 00616561 62695f70 61757468 A....aeabi_pauth
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-numerical-tags.s
index 047939d..25b6a18 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-numerical-tags.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-numerical-tags.s
@@ -2,15 +2,15 @@
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
// ASM: .aeabi_attribute 0, 1
-// ASM: .aeabi_attribute Tag_PAuth_Platform, 1
-// ASM: .aeabi_attribute Tag_PAuth_Schema, 1
+// ASM: .aeabi_attribute 1, 1 // Tag_PAuth_Platform
+// ASM: .aeabi_attribute 2, 1 // Tag_PAuth_Schema
// ASM: .aeabi_attribute 3, 1
// ASM: .aeabi_attribute 4, 1
// ASM: .aeabi_attribute 5, 1
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 1
-// ASM: .aeabi_attribute Tag_Feature_PAC, 1
-// ASM: .aeabi_attribute Tag_Feature_GCS, 1
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
// ASM: .aeabi_attribute 3, 1
// ASM: .aeabi_attribute 4, 1
// ASM: .aeabi_attribute 5, 1
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-out-of-order.s
index 2d5d425..c4192cd 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-out-of-order.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-out-of-order.s
@@ -3,18 +3,18 @@
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 1
+// ASM: .aeabi_attribute 0, 1 // Tag_Feature_BTI
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
-// ASM: .aeabi_attribute Tag_PAuth_Schema, 1
+// ASM: .aeabi_attribute 2, 1 // Tag_PAuth_Schema
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
-// ASM: .aeabi_attribute Tag_PAuth_Platform, 1
+// ASM: .aeabi_attribute 1, 1 // Tag_PAuth_Platform
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_GCS, 1
+// ASM: .aeabi_attribute 2, 1 // Tag_Feature_GCS
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_PAC, 0
+// ASM: .aeabi_attribute 1, 0 // Tag_Feature_PAC
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
// ASM: .aeabi_attribute 7, 1
// ASM: .aeabi_subsection aeabi_pauthabi, required, uleb128
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-pac.s
index e3191ac..228e2be 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-pac.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-aeabi-pac.s
@@ -2,9 +2,9 @@
// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
// ASM: .aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-// ASM: .aeabi_attribute Tag_Feature_BTI, 0
-// ASM: .aeabi_attribute Tag_Feature_PAC, 1
-// ASM: .aeabi_attribute Tag_Feature_GCS, 0
+// ASM: .aeabi_attribute 0, 0 // Tag_Feature_BTI
+// ASM: .aeabi_attribute 1, 1 // Tag_Feature_PAC
+// ASM: .aeabi_attribute 2, 0 // Tag_Feature_GCS
// ELF: Hex dump of section '.ARM.attributes':
// ELF-NEXT: 0x00000000 41230000 00616561 62695f66 65617475 A#...aeabi_featu
@@ -13,6 +13,6 @@
.aeabi_subsection aeabi_feature_and_bits, optional, uleb128
-.aeabi_attribute Tag_Feature_BTI, 0
-.aeabi_attribute Tag_Feature_PAC, 1
-.aeabi_attribute Tag_Feature_GCS, 0
+.aeabi_attribute 0, 0 // Tag_Feature_BTI
+.aeabi_attribute 1, 1 // Tag_Feature_PAC
+.aeabi_attribute 2, 0 // Tag_Feature_GCS
diff --git a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s b/llvm/test/MC/AArch64/build-attributes-asm-non_aeabi-err.s
index 5884a74..6e28481 100644
--- a/llvm/test/MC/AArch64/aarch64-build-attributes-asm-private-subsections-err.s
+++ b/llvm/test/MC/AArch64/build-attributes-asm-non_aeabi-err.s
@@ -15,6 +15,11 @@
// ERR: error: active subsection type is NTBS (string), found ULEB128 (unsigned)
// ERR-NEXT: .aeabi_attribute 324, 1
+.aeabi_attribute str_not_int, "1"
+// ERR: error: unrecognized Tag: 'str_not_int'
+// ERR-NEXT: Except for public subsections, tags have to be an unsigned int.
+// ERR-NEXT: .aeabi_attribute str_not_int, "1"
+
.aeabi_subsection foo, optional, uleb128
.aeabi_subsection bar, optional, uleb128
.aeabi_subsection foo, required, uleb128
diff --git a/llvm/test/MC/AArch64/build-attributes-asm-non_aeabi.s b/llvm/test/MC/AArch64/build-attributes-asm-non_aeabi.s
new file mode 100644
index 0000000..ef55a3c
--- /dev/null
+++ b/llvm/test/MC/AArch64/build-attributes-asm-non_aeabi.s
@@ -0,0 +1,49 @@
+// RUN: llvm-mc -triple=aarch64 %s -o - | FileCheck %s --check-prefix=ASM
+// RUN: llvm-mc -triple=aarch64 -filetype=obj %s -o - | llvm-readelf --hex-dump=.ARM.attributes - | FileCheck %s --check-prefix=ELF
+
+// ASM: .aeabi_subsection private_subsection_1, optional, uleb128
+// ASM: .aeabi_attribute 12, 257
+// ASM: .aeabi_subsection aeabi_2, required, uleb128
+// ASM: .aeabi_attribute 76, 257
+// ASM: .aeabi_subsection aeabi_3, optional, ntbs
+// ASM: .aeabi_attribute 34, hello_llvm
+// ASM: .aeabi_subsection private_subsection_4, required, ntbs
+// ASM: .aeabi_attribute 777, "hello_llvm"
+// ASM: .aeabi_subsection private_subsection_1, optional, uleb128
+// ASM: .aeabi_attribute 876, 257
+// ASM: .aeabi_subsection aeabi_2, required, uleb128
+// ASM: .aeabi_attribute 876, 257
+// ASM: .aeabi_subsection aeabi_3, optional, ntbs
+// ASM: .aeabi_attribute 876, "hello_llvm"
+// ASM: .aeabi_subsection private_subsection_4, required, ntbs
+// ASM: .aeabi_attribute 876, hello_llvm
+
+// ELF: Hex dump of section '.ARM.attributes':
+// ELF: 0x00000000 41220000 00707269 76617465 5f737562 A"...private_sub
+// ELF: 0x00000010 73656374 696f6e5f 31000100 0c8102ec section_1.......
+// ELF: 0x00000020 06810215 00000061 65616269 5f320000 .......aeabi_2..
+// ELF: 0x00000030 004c8102 ec068102 29000000 61656162 .L......)...aeab
+// ELF: 0x00000040 695f3300 01012268 656c6c6f 5f6c6c76 i_3..."hello_llv
+// ELF: 0x00000050 6d00ec06 2268656c 6c6f5f6c 6c766d22 m..."hello_llvm"
+// ELF: 0x00000060 00370000 00707269 76617465 5f737562 .7...private_sub
+// ELF: 0x00000070 73656374 696f6e5f 34000001 89062268 section_4....."h
+// ELF: 0x00000080 656c6c6f 5f6c6c76 6d2200ec 0668656c ello_llvm"...hel
+// ELF: 0x00000090 6c6f5f6c 6c766d00 lo_llvm.
+
+
+.aeabi_subsection private_subsection_1, optional, uleb128
+.aeabi_attribute 12, 257
+.aeabi_subsection aeabi_2, required, uleb128
+.aeabi_attribute 76, 257
+.aeabi_subsection aeabi_3, optional, ntbs
+.aeabi_attribute 34, hello_llvm
+.aeabi_subsection private_subsection_4, required, ntbs
+.aeabi_attribute 777, "hello_llvm"
+.aeabi_subsection private_subsection_1, optional, uleb128
+.aeabi_attribute 876, 257
+.aeabi_subsection aeabi_2, required, uleb128
+.aeabi_attribute 876, 257
+.aeabi_subsection aeabi_3, optional, ntbs
+.aeabi_attribute 876, "hello_llvm"
+.aeabi_subsection private_subsection_4, required, ntbs
+.aeabi_attribute 876, hello_llvm
diff --git a/llvm/test/MC/Disassembler/RISCV/xqci-invalid.txt b/llvm/test/MC/Disassembler/RISCV/xqci-invalid.txt
new file mode 100644
index 0000000..da0c485
--- /dev/null
+++ b/llvm/test/MC/Disassembler/RISCV/xqci-invalid.txt
@@ -0,0 +1,10 @@
+# RUN: not llvm-mc -disassemble -triple=riscv32 -mattr=+experimental-xqciac %s | FileCheck %s
+
+[0x00,0x00]
+# CHECK: unimp
+
+[0x8b,0x30,0x31,0x46]
+# CHECK-NOT: qc.shladd x1, x2, x3, {{[0-9]+}}
+
+[0x00,0x00]
+# CHECK: unimp
diff --git a/llvm/test/MC/RISCV/xrivosvizip-invalid.s b/llvm/test/MC/RISCV/xrivosvizip-invalid.s
index 5a36c77..6d87378 100644
--- a/llvm/test/MC/RISCV/xrivosvizip-invalid.s
+++ b/llvm/test/MC/RISCV/xrivosvizip-invalid.s
@@ -3,8 +3,8 @@
# Disallowed source/dest overlap cases
# CHECK: error: the destination vector register group cannot overlap the source vector register group
-rv.vzipeven.vv v2, v2, v3
+ri.vzipeven.vv v2, v2, v3
# CHECK: error: the destination vector register group cannot overlap the source vector register group
-rv.vzipeven.vv v3, v2, v3
+ri.vzipeven.vv v3, v2, v3
# CHECK: error: the destination vector register group cannot overlap the mask register
-rv.vzipeven.vv v0, v2, v3, v0.t
+ri.vzipeven.vv v0, v2, v3, v0.t
diff --git a/llvm/test/MC/RISCV/xrivosvizip-valid.s b/llvm/test/MC/RISCV/xrivosvizip-valid.s
index 1447e0a9..cab1ebb 100644
--- a/llvm/test/MC/RISCV/xrivosvizip-valid.s
+++ b/llvm/test/MC/RISCV/xrivosvizip-valid.s
@@ -9,51 +9,51 @@
# RUN: | llvm-objdump --mattr=+experimental-xrivosvizip -M no-aliases -d -r - \
# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
-# CHECK-ASM-AND-OBJ: rv.vzipeven.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzipeven.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x32]
-rv.vzipeven.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vzipeven.vv v1, v2, v3, v0.t
+ri.vzipeven.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzipeven.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x30]
-rv.vzipeven.vv v1, v2, v3, v0.t
-# CHECK-ASM-AND-OBJ: rv.vzipodd.vv v1, v2, v3
+ri.vzipeven.vv v1, v2, v3, v0.t
+# CHECK-ASM-AND-OBJ: ri.vzipodd.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x72]
-rv.vzipodd.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vzipodd.vv v1, v2, v3, v0.t
+ri.vzipodd.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzipodd.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x70]
-rv.vzipodd.vv v1, v2, v3, v0.t
+ri.vzipodd.vv v1, v2, v3, v0.t
-# CHECK-ASM-AND-OBJ: rv.vzip2a.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzip2a.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x12]
-rv.vzip2a.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vzip2a.vv v1, v2, v3, v0.t
+ri.vzip2a.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzip2a.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x10]
-rv.vzip2a.vv v1, v2, v3, v0.t
-# CHECK-ASM-AND-OBJ: rv.vzip2b.vv v1, v2, v3
+ri.vzip2a.vv v1, v2, v3, v0.t
+# CHECK-ASM-AND-OBJ: ri.vzip2b.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x52]
-rv.vzip2b.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vzip2b.vv v1, v2, v3, v0.t
+ri.vzip2b.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vzip2b.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x50]
-rv.vzip2b.vv v1, v2, v3, v0.t
+ri.vzip2b.vv v1, v2, v3, v0.t
-# CHECK-ASM-AND-OBJ: rv.vunzip2a.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vunzip2a.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x22]
-rv.vunzip2a.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vunzip2a.vv v1, v2, v3, v0.t
+ri.vunzip2a.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vunzip2a.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x20]
-rv.vunzip2a.vv v1, v2, v3, v0.t
-# CHECK-ASM-AND-OBJ: rv.vunzip2b.vv v1, v2, v3
+ri.vunzip2a.vv v1, v2, v3, v0.t
+# CHECK-ASM-AND-OBJ: ri.vunzip2b.vv v1, v2, v3
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x62]
-rv.vunzip2b.vv v1, v2, v3
-# CHECK-ASM-AND-OBJ: rv.vunzip2b.vv v1, v2, v3, v0.t
+ri.vunzip2b.vv v1, v2, v3
+# CHECK-ASM-AND-OBJ: ri.vunzip2b.vv v1, v2, v3, v0.t
# CHECK-ASM: encoding: [0xdb,0x80,0x21,0x60]
-rv.vunzip2b.vv v1, v2, v3, v0.t
+ri.vunzip2b.vv v1, v2, v3, v0.t
# Overlap between source registers *is* allowed
-# CHECK-ASM-AND-OBJ: rv.vzipeven.vv v1, v2, v2
+# CHECK-ASM-AND-OBJ: ri.vzipeven.vv v1, v2, v2
# CHECK-ASM: encoding: [0xdb,0x00,0x21,0x32]
-rv.vzipeven.vv v1, v2, v2
+ri.vzipeven.vv v1, v2, v2
-# CHECK-ASM-AND-OBJ: rv.vzipeven.vv v1, v2, v0, v0.t
+# CHECK-ASM-AND-OBJ: ri.vzipeven.vv v1, v2, v0, v0.t
# CHECK-ASM: encoding: [0xdb,0x00,0x20,0x30]
-rv.vzipeven.vv v1, v2, v0, v0.t
+ri.vzipeven.vv v1, v2, v0, v0.t
diff --git a/llvm/test/Transforms/ConstraintElimination/analysis-invalidation.ll b/llvm/test/Transforms/ConstraintElimination/analysis-invalidation.ll
index 932be12..cb51f8c 100644
--- a/llvm/test/Transforms/ConstraintElimination/analysis-invalidation.ll
+++ b/llvm/test/Transforms/ConstraintElimination/analysis-invalidation.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -passes='require<demanded-bits>,constraint-elimination,require<demanded-bits>' -disable-verify -verify-analysis-invalidation=false -debug-pass-manager -disable-output %s 2>&1 | FileCheck %s
; Check that constraint-elimination properly invalidates analyses.
@@ -34,6 +33,17 @@
; CHECK-NEXT: Running pass: RequireAnalysisPass
; CHECK-NEXT: Running analysis: DemandedBitsAnalysis on uge_zext
+; CHECK-NEXT: Running pass: RequireAnalysisPass
+; CHECK-NEXT: Running analysis: DemandedBitsAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: AssumptionAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: TargetIRAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running pass: ConstraintEliminationPass on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: LoopAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: ScalarEvolutionAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: TargetLibraryAnalysis on test_mul_const_nuw_unsigned_14
+; CHECK-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis on test_mul_const_nuw_unsigned_14
+
declare { i8, i1 } @llvm.ssub.with.overflow.i8(i8, i8)
define i8 @ssub_no_overflow_due_to_or_conds(i8 %a, i8 %b) {
@@ -72,5 +82,13 @@ bb1:
bb2:
ret i1 false
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
+
+define i1 @test_mul_const_nuw_unsigned_14(i8 %start) {
+ %start.mul.5 = mul nuw i8 %start, -5
+ %c.0 = icmp ult i8 %start, %start.mul.5
+ call void @llvm.assume(i1 %c.0)
+
+ %start.mul.3 = mul nuw i8 %start, -3
+ %t.1 = icmp ule i8 %start.mul.3, %start.mul.5
+ ret i1 %t.1
+}
diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll
index a5ad1e0..d10aabf 100644
--- a/llvm/test/Transforms/InstCombine/load.ll
+++ b/llvm/test/Transforms/InstCombine/load.ll
@@ -451,3 +451,33 @@ define i32 @load_select_with_null_gep(i1 %cond, ptr %p, i64 %off) {
%res = load i32, ptr %gep, align 4
ret i32 %res
}
+
+define i16 @load_select_with_null_gep2(i1 %cond, ptr %p, i64 %x) {
+; CHECK-LABEL: @load_select_with_null_gep2(
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[SEL:%.*]], i64 -2
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[X:%.*]]
+; CHECK-NEXT: [[RES:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT: ret i16 [[RES]]
+;
+ %sel = select i1 %cond, ptr %p, ptr null
+ %invariant.gep = getelementptr i8, ptr %sel, i64 -2
+ %gep = getelementptr i16, ptr %invariant.gep, i64 %x
+ %res = load i16, ptr %gep, align 2
+ ret i16 %res
+}
+
+define i16 @load_select_with_null_gep3(i1 %cond, ptr %p, i64 %x, i64 %y) {
+; CHECK-LABEL: @load_select_with_null_gep3(
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[SEL:%.*]], i64 -2
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[X:%.*]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i16, ptr [[GEP]], i64 [[Y:%.*]]
+; CHECK-NEXT: [[RES:%.*]] = load i16, ptr [[GEP2]], align 2
+; CHECK-NEXT: ret i16 [[RES]]
+;
+ %sel = select i1 %cond, ptr %p, ptr null
+ %invariant.gep = getelementptr i8, ptr %sel, i64 -2
+ %gep = getelementptr i16, ptr %invariant.gep, i64 %x
+ %gep2 = getelementptr i16, ptr %gep, i64 %y
+ %res = load i16, ptr %gep2, align 2
+ ret i16 %res
+}
diff --git a/llvm/test/Transforms/MergeFunc/comdat.ll b/llvm/test/Transforms/MergeFunc/comdat.ll
index f6e1046..3770c77 100644
--- a/llvm/test/Transforms/MergeFunc/comdat.ll
+++ b/llvm/test/Transforms/MergeFunc/comdat.ll
@@ -19,6 +19,7 @@ define linkonce_odr hidden i32 @g(i32 %x, i32 %y) comdat {
ret i32 %sum3
}
-; CHECK-DAG: define linkonce_odr hidden i32 @f(i32 %x, i32 %y) comdat
-; CHECK-DAG: define linkonce_odr hidden i32 @g(i32 %0, i32 %1) comdat
+; CHECK-DAG: define private i32 @0(i32 %x, i32 %y) comdat($f)
+; CHECK-DAG: define linkonce_odr hidden i32 @g(i32 %0, i32 %1) comdat {
+; CHECK-DAG: define linkonce_odr hidden i32 @f(i32 %0, i32 %1) {
diff --git a/llvm/test/Transforms/MergeFunc/linkonce_odr.ll b/llvm/test/Transforms/MergeFunc/linkonce_odr.ll
index 14b56a8..ecbe6f0 100644
--- a/llvm/test/Transforms/MergeFunc/linkonce_odr.ll
+++ b/llvm/test/Transforms/MergeFunc/linkonce_odr.ll
@@ -7,8 +7,6 @@
; The problem with this is that the linker could then choose these two stubs
; each of the two modules and we end up with two stubs calling each other.
-
-
define linkonce_odr i32 @funC(i32 %x, i32 %y) {
%sum = add i32 %x, %y
%sum2 = add i32 %x, %sum
@@ -37,7 +35,7 @@ define linkonce_odr i32 @funA(i32 %x, i32 %y) {
;.
; CHECK: @take_addr_of_funB = global ptr @funB
;.
-; CHECK-LABEL: define linkonce_odr i32 @funA(
+; CHECK-LABEL: define private i32 @0(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[X]], [[Y]]
; CHECK-NEXT: [[SUM2:%.*]] = add i32 [[X]], [[SUM]]
@@ -45,8 +43,14 @@ define linkonce_odr i32 @funA(i32 %x, i32 %y) {
; CHECK-NEXT: ret i32 [[SUM3]]
;
;
+; CHECK-LABEL: define linkonce_odr i32 @funC(
+; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @[[GLOB0:[0-9]+]](i32 [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+;
; CHECK-LABEL: define linkonce_odr i32 @funB(
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
-; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @funA(i32 [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @[[GLOB0]](i32 [[TMP0]], i32 [[TMP1]])
; CHECK-NEXT: ret i32 [[TMP3]]
;
diff --git a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-used.ll b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-used.ll
index fe6a521..db53a78 100644
--- a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-used.ll
+++ b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-used.ll
@@ -41,13 +41,13 @@ declare void @foo(ptr)
;.
; CHECK-LABEL: define void @caller_of_callers(
; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0:[0-9]+]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_1(
+; CHECK-LABEL: define private void @0(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @foo(ptr [[P]])
@@ -58,12 +58,18 @@ declare void @foo(ptr)
;
; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_2(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @linkonce_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_1(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_3(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @linkonce_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-weak-odr-mixed-used.ll b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-weak-odr-mixed-used.ll
index 63c6b22..19f98d9 100644
--- a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-weak-odr-mixed-used.ll
+++ b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr-weak-odr-mixed-used.ll
@@ -41,13 +41,13 @@ declare void @foo(ptr)
;.
; CHECK-LABEL: define void @caller_of_callers(
; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_2(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_2(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_2(ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0:[0-9]+]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_2(
+; CHECK-LABEL: define private void @0(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @foo(ptr [[P]])
@@ -58,12 +58,18 @@ declare void @foo(ptr)
;
; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_1(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @linkonce_odr_caller_of_foo_2(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define linkonce_odr void @linkonce_odr_caller_of_foo_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_3(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @linkonce_odr_caller_of_foo_2(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr.ll b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr.ll
index 13d9fe3..7e815e1 100644
--- a/llvm/test/Transforms/MergeFunc/merge-linkonce-odr.ll
+++ b/llvm/test/Transforms/MergeFunc/merge-linkonce-odr.ll
@@ -85,20 +85,21 @@ entry:
}
declare void @zar(ptr)
+
; CHECK-LABEL: define void @caller_of_callers(
; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_bar_2(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_bar_2(ptr [[P]])
-; CHECK-NEXT: call void @linkonce_odr_caller_of_bar_2(ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0:[0-9]+]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
; CHECK-NEXT: call void @hidden_caller_of_zar_1(ptr [[P]])
; CHECK-NEXT: call void @hidden_caller_of_zar_1(ptr [[P]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define linkonce_odr hidden void @linkonce_odr_caller_of_foo_1(
+; CHECK-LABEL: define private void @0(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @foo(ptr [[P]])
@@ -107,7 +108,7 @@ declare void @zar(ptr)
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define linkonce_odr hidden void @linkonce_odr_caller_of_bar_2(
+; CHECK-LABEL: define internal void @internal_caller_of_bar_1(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @bar(ptr [[P]])
@@ -124,3 +125,15 @@ declare void @zar(ptr)
; CHECK-NEXT: tail call void @zar(ptr [[P]])
; CHECK-NEXT: ret void
;
+;
+; CHECK-LABEL: define linkonce_odr hidden void @linkonce_odr_caller_of_foo_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define linkonce_odr hidden void @linkonce_odr_caller_of_foo_1(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
diff --git a/llvm/test/Transforms/MergeFunc/merge-weak-odr-used.ll b/llvm/test/Transforms/MergeFunc/merge-weak-odr-used.ll
index 6c2b22f..601a127 100644
--- a/llvm/test/Transforms/MergeFunc/merge-weak-odr-used.ll
+++ b/llvm/test/Transforms/MergeFunc/merge-weak-odr-used.ll
@@ -41,13 +41,13 @@ declare void @foo(ptr)
;.
; CHECK-LABEL: define void @caller_of_callers(
; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0:[0-9]+]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_1(
+; CHECK-LABEL: define private void @0(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @foo(ptr [[P]])
@@ -58,12 +58,18 @@ declare void @foo(ptr)
;
; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_2(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @weak_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_1(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define weak_odr void @weak_odr_caller_of_foo_3(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @weak_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/MergeFunc/merge-weak-odr.ll b/llvm/test/Transforms/MergeFunc/merge-weak-odr.ll
index 3ea279a..01acad0 100644
--- a/llvm/test/Transforms/MergeFunc/merge-weak-odr.ll
+++ b/llvm/test/Transforms/MergeFunc/merge-weak-odr.ll
@@ -87,18 +87,18 @@ entry:
declare void @zar(ptr)
; CHECK-LABEL: define void @caller_of_callers(
; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_foo_1(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_bar_2(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_bar_2(ptr [[P]])
-; CHECK-NEXT: call void @weak_odr_caller_of_bar_2(ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0:[0-9]+]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @[[GLOB0]](ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
+; CHECK-NEXT: call void @internal_caller_of_bar_1(ptr [[P]])
; CHECK-NEXT: call void @hidden_caller_of_zar_1(ptr [[P]])
; CHECK-NEXT: call void @hidden_caller_of_zar_1(ptr [[P]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_foo_1(
+; CHECK-LABEL: define private void @0(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @foo(ptr [[P]])
@@ -107,7 +107,7 @@ declare void @zar(ptr)
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_bar_2(
+; CHECK-LABEL: define internal void @internal_caller_of_bar_1(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: tail call void @bar(ptr [[P]])
@@ -127,19 +127,31 @@ declare void @zar(ptr)
;
; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_foo_2(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @weak_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_foo_1(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_foo_3(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @weak_odr_caller_of_foo_1(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @[[GLOB0]](ptr [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_bar_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: tail call void @internal_caller_of_bar_1(ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define weak_odr hidden void @weak_odr_caller_of_bar_3(
; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT: tail call void @weak_odr_caller_of_bar_2(ptr [[TMP0]])
+; CHECK-NEXT: tail call void @internal_caller_of_bar_1(ptr [[TMP0]])
; CHECK-NEXT: ret void
;
;
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll b/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll
new file mode 100644
index 0000000..6321731
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/fake-use-considered-when-sinking.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p="simplifycfg<sink-common-insts>" -S < %s | FileCheck %s
+
+;; Verify that fake uses are not ignored when sinking instructions in
+;; SimplifyCFG: a fake use that appears in only one incoming block prevents
+;; further sinking, while identical fake uses that appear on both sides are
+;; sunk normally.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i1 %bool, ptr %p) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i1 [[BOOL:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[BOOL]], label %[[IF_ELSE:.*]], label %[[IF_THEN:.*]]
+; CHECK: [[COMMON_RET:.*]]:
+; CHECK-NEXT: ret void
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT: br label %[[COMMON_RET]]
+; CHECK: [[IF_ELSE]]:
+; CHECK-NEXT: store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT: notail call void (...) @llvm.fake.use(ptr [[P]])
+; CHECK-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ br i1 %bool, label %if.else, label %if.then
+
+common.ret: ; preds = %if.else, %if.then
+ ret void
+
+if.then: ; preds = %entry
+ store ptr %p, ptr %p, align 8
+ br label %common.ret
+
+if.else: ; preds = %entry
+ store ptr %p, ptr %p, align 8
+ notail call void (...) @llvm.fake.use(ptr %p)
+ br label %common.ret
+}
+
+define void @bar(i1 %bool, ptr %p) {
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: i1 [[BOOL:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: store ptr [[P]], ptr [[P]], align 8
+; CHECK-NEXT: notail call void (...) @llvm.fake.use(ptr [[P]])
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 %bool, label %if.else, label %if.then
+
+common.ret: ; preds = %if.else, %if.then
+ ret void
+
+if.then: ; preds = %entry
+ store ptr %p, ptr %p, align 8
+ notail call void (...) @llvm.fake.use(ptr %p)
+ br label %common.ret
+
+if.else: ; preds = %entry
+ store ptr %p, ptr %p, align 8
+ notail call void (...) @llvm.fake.use(ptr %p)
+ br label %common.ret
+}
+
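Editor's note: the sinking legality rule this test exercises boils down to requiring identical candidates in every predecessor. A minimal sketch of that rule (an illustration, not the actual SimplifyCFG code):

```c++
#include "llvm/IR/Instruction.h"

// Minimal sketch of the sinking rule the test above exercises (assumption:
// not the actual SimplifyCFG implementation). An instruction -- including a
// fake use -- may only be sunk when every predecessor holds an identical
// copy, so a fake use present on one side only blocks further sinking.
static bool identicalInBothPreds(const llvm::Instruction *A,
                                 const llvm::Instruction *B) {
  if (!A->isSameOperationAs(B))
    return false;
  for (unsigned I = 0, E = A->getNumOperands(); I != E; ++I)
    if (A->getOperand(I) != B->getOperand(I))
      return false;
  return true;
}
```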
diff --git a/llvm/test/Verifier/invoke.ll b/llvm/test/Verifier/invoke.ll
index 20f61b9..11161e2 100644
--- a/llvm/test/Verifier/invoke.ll
+++ b/llvm/test/Verifier/invoke.ll
@@ -46,7 +46,7 @@ contb:
define i8 @f2() personality ptr @__gxx_personality_v0 {
entry:
-; CHECK: Cannot invoke an intrinsic other than donothing, patchpoint, statepoint, coro_resume, coro_destroy or clang.arc.attachedcall
+; CHECK: Cannot invoke an intrinsic other than donothing, patchpoint, statepoint, coro_resume, coro_destroy, clang.arc.attachedcall or wasm.(re)throw
invoke void @llvm.trap()
to label %cont unwind label %lpad
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 81d5fe0..34c1b87 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -29,7 +29,7 @@
#include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h"
#include "llvm/ExecutionEngine/Orc/EPCEHFrameRegistrar.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/GetTapiInterface.h"
+#include "llvm/ExecutionEngine/Orc/GetDylibInterface.h"
#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h"
#include "llvm/ExecutionEngine/Orc/JITLinkReentryTrampolines.h"
@@ -2117,17 +2117,8 @@ static SmallVector<StringRef, 5> getSearchPathsFromEnvVar(Session &S) {
}
static Expected<std::unique_ptr<DefinitionGenerator>>
-LoadLibraryWeak(Session &S, StringRef InterfacePath) {
- auto TapiFileBuffer = getFile(InterfacePath);
- if (!TapiFileBuffer)
- return TapiFileBuffer.takeError();
-
- auto Tapi =
- object::TapiUniversal::create((*TapiFileBuffer)->getMemBufferRef());
- if (!Tapi)
- return Tapi.takeError();
-
- auto Symbols = getInterfaceFromTapiFile(S.ES, **Tapi);
+LoadLibraryWeak(Session &S, StringRef Path) {
+ auto Symbols = getDylibInterface(S.ES, Path);
if (!Symbols)
return Symbols.takeError();
@@ -2227,7 +2218,7 @@ static Error addLibraries(Session &S,
StringRef StandardExtensions[] = {".so", ".dylib", ".dll", ".a", ".lib"};
StringRef DynLibExtensionsOnly[] = {".so", ".dylib", ".dll"};
StringRef ArchiveExtensionsOnly[] = {".a", ".lib"};
- StringRef InterfaceExtensionsOnly = {".tbd"};
+ StringRef WeakLinkExtensionsOnly[] = {".dylib", ".tbd"};
// Add -lx arguments to LibraryLoads.
for (auto LibItr = Libraries.begin(), LibEnd = Libraries.end();
@@ -2260,7 +2251,7 @@ static Error addLibraries(Session &S,
LibraryLoad LL;
LL.LibName = *LibWeakItr;
LL.Position = LibrariesWeak.getPosition(LibWeakItr - LibrariesWeak.begin());
- LL.CandidateExtensions = InterfaceExtensionsOnly;
+ LL.CandidateExtensions = WeakLinkExtensionsOnly;
LL.Modifier = LibraryLoad::Weak;
LibraryLoadQueue.push_back(std::move(LL));
}
@@ -2403,8 +2394,15 @@ static Error addLibraries(Session &S,
case file_magic::pecoff_executable:
case file_magic::elf_shared_object:
case file_magic::macho_dynamically_linked_shared_lib: {
- if (auto Err = S.loadAndLinkDynamicLibrary(JD, LibPath.data()))
- return Err;
+ if (LL.Modifier == LibraryLoad::Weak) {
+ if (auto G = LoadLibraryWeak(S, LibPath.data()))
+ JD.addGenerator(std::move(*G));
+ else
+ return G.takeError();
+ } else {
+ if (auto Err = S.loadAndLinkDynamicLibrary(JD, LibPath.data()))
+ return Err;
+ }
break;
}
case file_magic::archive:
diff --git a/llvm/tools/llvm-size/llvm-size.cpp b/llvm/tools/llvm-size/llvm-size.cpp
index 0d7bf24..afb6b23 100644
--- a/llvm/tools/llvm-size/llvm-size.cpp
+++ b/llvm/tools/llvm-size/llvm-size.cpp
@@ -76,7 +76,7 @@ static std::vector<StringRef> ArchFlags;
static bool ELFCommons;
static OutputFormatTy OutputFormat;
static bool DarwinLongFormat;
-static RadixTy Radix;
+static RadixTy Radix = RadixTy::decimal;
static bool TotalSizes;
static std::vector<std::string> InputFilenames;
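Editor's note: the fix above replaces reliance on zero-initialization of a file-scope static with an explicit default. A self-contained sketch of the trap (the enumerator values are assumptions for illustration):

```c++
// Sketch of the bug class fixed above (enumerator values are assumed for
// illustration). A zero-initialized static enum silently takes value 0,
// which need not correspond to any named enumerator.
enum RadixTy { octal = 8, decimal = 10, hexadecimal = 16 };

static RadixTy Implicit;                    // zero-initialized: 0, no enumerator
static RadixTy Explicit = RadixTy::decimal; // self-documenting default
```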
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index a1ea784..27c0e0b 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1441,8 +1441,7 @@ TEST_F(OpenMPIRBuilderTest, CanonicalLoopSimple) {
EXPECT_EQ(&Loop->getAfter()->front(), RetInst);
}
-TEST_F(OpenMPIRBuilderTest, CanonicalLoopBounds) {
- using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+TEST_F(OpenMPIRBuilderTest, CanonicalLoopTripCount) {
OpenMPIRBuilder OMPBuilder(*M);
OMPBuilder.initialize();
IRBuilder<> Builder(BB);
@@ -1458,17 +1457,8 @@ TEST_F(OpenMPIRBuilderTest, CanonicalLoopBounds) {
Value *StartVal = ConstantInt::get(LCTy, Start);
Value *StopVal = ConstantInt::get(LCTy, Stop);
Value *StepVal = ConstantInt::get(LCTy, Step);
- auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, llvm::Value *LC) {
- return Error::success();
- };
- ASSERT_EXPECTED_INIT_RETURN(
- CanonicalLoopInfo *, Loop,
- OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, StartVal, StopVal,
- StepVal, IsSigned, InclusiveStop),
- -1);
- Loop->assertOK();
- Builder.restoreIP(Loop->getAfterIP());
- Value *TripCount = Loop->getTripCount();
+ Value *TripCount = OMPBuilder.calculateCanonicalLoopTripCount(
+ Loc, StartVal, StopVal, StepVal, IsSigned, InclusiveStop);
return cast<ConstantInt>(TripCount)->getValue().getZExtValue();
};
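Editor's note: the renamed test now exercises the trip-count computation directly rather than building a whole loop. As a worked reference, canonical trip-count math for an unsigned loop is roughly the following (a sketch under stated assumptions, not the OpenMPIRBuilder implementation):

```c++
#include <cstdint>

// Sketch only: ceil((Stop - Start) / Step), widened to handle an inclusive
// upper bound. Assumes unsigned counters and Step > 0; the real builder
// also handles signed counters and overflow at the type's limits.
static uint64_t tripCount(uint64_t Start, uint64_t Stop, uint64_t Step,
                          bool InclusiveStop) {
  if (Stop < Start)
    return 0;
  uint64_t Span = Stop - Start + (InclusiveStop ? 1 : 0);
  return (Span + Step - 1) / Step;
}
```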
diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
index ee511e0..39bd57d 100644
--- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn
@@ -66,6 +66,7 @@ executable("lldb-dap") {
"Handler/PauseRequestHandler.cpp",
"Handler/ReadMemoryRequestHandler.cpp",
"Handler/RequestHandler.cpp",
+ "Handler/ResponseHandler.cpp",
"Handler/RestartRequestHandler.cpp",
"Handler/ScopesRequestHandler.cpp",
"Handler/SetBreakpointsRequestHandler.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
index e6b9d70..8c8b9ff 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
@@ -34,7 +34,7 @@ static_library("Orc") {
"EPCIndirectionUtils.cpp",
"ExecutionUtils.cpp",
"ExecutorProcessControl.cpp",
- "GetTapiInterface.cpp",
+ "GetDylibInterface.cpp",
"IRCompileLayer.cpp",
"IRPartitionLayer.cpp",
"IRTransformLayer.cpp",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
index 0b9e4b9..2ecbf8f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
@@ -25,7 +25,8 @@ class LLVMType<string typeName, string typeMnemonic, list<Trait> traits = []>
//===----------------------------------------------------------------------===//
def LLVMArrayType : LLVMType<"LLVMArray", "array", [
- DeclareTypeInterfaceMethods<DataLayoutTypeInterface, ["getTypeSize"]>,
+ DeclareTypeInterfaceMethods<DataLayoutTypeInterface,
+ ["getTypeSize", "getPreferredAlignment"]>,
DeclareTypeInterfaceMethods<DestructurableTypeInterface>]> {
let summary = "LLVM array type";
let description = [{
@@ -124,7 +125,7 @@ def LLVMFunctionType : LLVMType<"LLVMFunction", "func"> {
def LLVMStructType : LLVMType<"LLVMStruct", "struct", [
MutableType,
DeclareTypeInterfaceMethods<DataLayoutTypeInterface,
- ["areCompatible", "verifyEntries"]>,
+ ["areCompatible", "verifyEntries", "getPreferredAlignment"]>,
DeclareTypeInterfaceMethods<DestructurableTypeInterface,
["getSubelementIndexMap", "getTypeAtIndex"]>
]> {
@@ -257,7 +258,8 @@ def LLVMStructType : LLVMType<"LLVMStruct", "struct", [
def LLVMPointerType : LLVMType<"LLVMPointer", "ptr", [
DeclareTypeInterfaceMethods<DataLayoutTypeInterface, [
- "getIndexBitwidth", "areCompatible", "verifyEntries"]>]> {
+ "getIndexBitwidth", "areCompatible", "verifyEntries",
+ "getPreferredAlignment"]>]> {
let summary = "LLVM pointer type";
let description = [{
The `!llvm.ptr` type is an LLVM pointer type. This type typically represents
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
index efd708c..386d2f3 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
@@ -25,6 +25,22 @@ function(add_linalg_ods_yaml_gen yaml_ast_file output_file)
set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} PARENT_SCOPE)
endfunction()
+# NOTE: `add_mlir_interface(interface)` adds `interface` as a dependency of
+# mlir-generic-headers, i.e.:
+# * mlir-generic-headers -> interface
+# In addition, we have an existing MLIR-wide dependency of:
+# * mlir-headers -> mlir-generic-headers.
+# Now, observe that:
+# 1. The targets below define _new_ dependencies for mlir-headers.
+# 2. Before the new targets are defined, `add_linalg_ods_yaml_gen` updates
+# LLVM_TARGET_DEPENDS.
+# 3. All tablegen targets pick up LLVM_TARGET_DEPENDS.
+# In order to avoid cyclic dependencies, we need to invoke
+# `add_mlir_interface` (and update `mlir-generic-headers`) _before_
+# LLVM_TARGET_DEPENDS is updated and new dependencies for `mlir-headers` are
+# defined and added.
+add_mlir_interface(RelayoutOpInterface)
+
# NOTE: LLVM_TARGET_DEPENDS gets picked up by tablegen targets to add file
# level dependencies. This is gross but CMake requires depending on both
# targets and generated files, and it must be done when the custom target is
@@ -77,3 +93,4 @@ mlir_tablegen(LinalgInterfaces.h.inc -gen-op-interface-decls)
mlir_tablegen(LinalgInterfaces.cpp.inc -gen-op-interface-defs)
add_public_tablegen_target(MLIRLinalgInterfacesIncGen)
add_dependencies(mlir-headers MLIRLinalgInterfacesIncGen)
+
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h
index 6f1c243..df32caf 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h
@@ -221,4 +221,7 @@ LogicalResult verifyStructuredOpInterface(Operation *op);
/// Include the generated interface declarations.
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h.inc"
+/// Include the generated relayout interface declarations.
+#include "mlir/Dialect/Linalg/IR/RelayoutOpInterface.h.inc"
+
#endif // MLIR_DIALECT_LINALG_IR_LINALGINTERFACES_H_
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
index 247afc1..dbc1ac6 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
@@ -178,16 +178,6 @@ def LinalgConvolutionOpInterface : OpInterface<"ConvolutionOpInterface"> {
];
}
-def LinalgRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> {
- let description = [{
- A Linalg relayout-op is either linalg.pack or linalg.unpack.
-
- While we could extend this interface with methods from Linalg_RelayoutOp,
- this is currently not needed and left as a TODO.
- }];
- let cppNamespace = "::mlir::linalg";
-}
-
def LinalgFillOpInterface : OpInterface<"FillOpInterface"> {
let description = [{
A fill operation is defined in general terms:
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
index a08a778..1e48a5e 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
@@ -23,6 +23,7 @@ include "mlir/Interfaces/DestinationStyleOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Dialect/Linalg/IR/LinalgInterfaces.td"
+include "mlir/Dialect/Linalg/IR/RelayoutOpInterface.td"
include "mlir/IR/OpAsmInterface.td"
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h b/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h
new file mode 100644
index 0000000..5e3c625
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h
@@ -0,0 +1,18 @@
+//===- RelayoutOpInterface.h - Relayout op interface ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the operation interface for relayout ops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_RELAYOUTOPINTERFACE_H_
+#define MLIR_DIALECT_RELAYOUTOPINTERFACE_H_
+
+#include "mlir/Dialect/Linalg/IR/RelayoutOpInterface.h.inc"
+
+#endif // MLIR_DIALECT_RELAYOUTOPINTERFACE_H_
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td b/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td
new file mode 100644
index 0000000..2dec2fc
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td
@@ -0,0 +1,25 @@
+//===- RelayoutOpInterface.td ------ Interface Declaration -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LINALG_IR_RELAYOUTOPINTERFACE
+#define LINALG_IR_RELAYOUTOPINTERFACE
+
+include "mlir/Interfaces/DestinationStyleOpInterface.td"
+include "mlir/IR/OpBase.td"
+
+def LinalgRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> {
+ let description = [{
+ A Linalg relayout-op is either linalg.pack or linalg.unpack.
+
+ While we could extend this interface with methods from Linalg_RelayoutOp,
+ this is currently not needed and left as a TODO.
+ }];
+ let cppNamespace = "::mlir::linalg";
+}
+
+#endif // LINALG_IR_RELAYOUTOPINTERFACE
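Editor's note: client code is unaffected by this move; only the header providing the declaration changes. A hypothetical usage sketch (the helper name is ours):

```c++
#include "mlir/Dialect/Linalg/IR/RelayoutOpInterface.h"
#include "mlir/IR/Operation.h"

// Hypothetical sketch: querying the relocated interface. The cast itself
// is unchanged by the split; matches linalg.pack and linalg.unpack.
static bool isRelayout(mlir::Operation *op) {
  return llvm::isa<mlir::linalg::RelayoutOpInterface>(op);
}
```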
diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
index 14d72c3..bc377dc 100644
--- a/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
+++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrDialect.td
@@ -38,7 +38,8 @@ class Ptr_Type<string name, string typeMnemonic, list<Trait> traits = []>
def Ptr_PtrType : Ptr_Type<"Ptr", "ptr", [
MemRefElementTypeInterface,
DeclareTypeInterfaceMethods<DataLayoutTypeInterface, [
- "areCompatible", "getIndexBitwidth", "verifyEntries"]>
+ "areCompatible", "getIndexBitwidth", "verifyEntries",
+ "getPreferredAlignment"]>
]> {
let summary = "pointer type";
let description = [{
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
index ea2f457..63163b7 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
@@ -136,7 +136,7 @@ struct PipeliningOption {
/// The callback passes the operation created along with the part of the
/// pipeline and the iteration index. The iteration index is always 0 for the
/// kernel. For the prologue and epilogue, it corresponds to the iteration
- /// peeled out of the loop in the range [0, maxStage[.
+ /// peeled out of the loop in the range [0, maxStage].
using AnnotationlFnType =
std::function<void(Operation *, PipelinerPart, unsigned)>;
AnnotationlFnType annotateFn = nullptr;
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
index b7d6ec7..a4fe295 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file contains image ops for the SPIR-V dialect. It corresponds
-// to "3.37.10. Image Instructions" of the SPIR-V specification.
+// to "3.56.10. Image Instructions" of the SPIR-V specification.
//
//===----------------------------------------------------------------------===//
@@ -19,12 +19,56 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
// -----
-def SPIRV_ImageDrefGatherOp : SPIRV_Op<"ImageDrefGather", [Pure]> {
+class SPIRV_ValuesAreContained<string operand, list<string> values, string transform, string type, string getter> :
+ CPred<"::llvm::is_contained("
+ "{::mlir::spirv::" # type # "::" # !interleave(values, ", ::mlir::spirv::" # type # "::") # "},"
+ "::llvm::cast<::mlir::spirv::ImageType>(" # !subst("$_self", "$" # operand # ".getType()", transform) # ")." # getter # "()"
+ ")"
+>;
+
+class SPIRV_SampledOperandIs<string operand, list<string> values, string transform="$_self"> : PredOpTrait<
+ "the sampled operand of the underlying image must be " # !interleave(values, " or "),
+ SPIRV_ValuesAreContained<operand, values, transform, "ImageSamplerUseInfo", "getSamplerUseInfo">
+>;
+
+class SPIRV_MSOperandIs<string operand, list<string> values, string transform="$_self"> : PredOpTrait<
+ "the MS operand of the underlying image type must be " # !interleave(values, " or "),
+ SPIRV_ValuesAreContained<operand, values, transform, "ImageSamplingInfo", "getSamplingInfo">
+>;
+
+class SPIRV_DimIs<string operand, list<string> values, string transform="$_self"> : PredOpTrait<
+ "the Dim operand of the underlying image must be " # !interleave(values, " or "),
+ SPIRV_ValuesAreContained<operand, values, transform, "Dim", "getDim">
+>;
+
+class SPIRV_DimIsNot<string operand, list<string> values, string transform="$_self"> : PredOpTrait<
+ "the Dim operand of the underlying image must not be " # !interleave(values, " or "),
+ Neg<SPIRV_ValuesAreContained<operand, values, transform, "Dim", "getDim">>
+>;
+
+class SPIRV_NoneOrElementMatchImage<string operand, string image, string transform="$_self"> : PredOpTrait<
+ "the " # operand # " component type must match the image sampled type",
+ CPred<"::llvm::isa<NoneType>(cast<ImageType>(" # !subst("$_self", "$" # image # ".getType()", transform) # ").getElementType()) ||"
+ "(getElementTypeOrSelf($" # operand # ")"
+ "=="
+ "cast<ImageType>(" # !subst("$_self", "$" # image # ".getType()", transform) # ").getElementType())"
+ >
+>;
+
+def SPIRV_SampledImageTransform : StrFunc<"llvm::cast<spirv::SampledImageType>($_self).getImageType()">;
+
+// -----
+
+def SPIRV_ImageDrefGatherOp : SPIRV_Op<"ImageDrefGather",
+ [Pure,
+ SPIRV_DimIs<"sampled_image", ["Dim2D", "Cube", "Rect"], SPIRV_SampledImageTransform.result>,
+ SPIRV_MSOperandIs<"sampled_image", ["SingleSampled"], SPIRV_SampledImageTransform.result>,
+ SPIRV_NoneOrElementMatchImage<"result", "sampled_image", SPIRV_SampledImageTransform.result>]>{
let summary = "Gathers the requested depth-comparison from four texels.";
let description = [{
Result Type must be a vector of four components of floating-point type
- or integer type. Its components must be the same as Sampled Type of the
+ or integer type. Its components must be the same as Sampled Type of the
underlying OpTypeImage (unless that underlying Sampled Type is
OpTypeVoid). It has one component per gathered texel.
@@ -32,8 +76,8 @@ def SPIRV_ImageDrefGatherOp : SPIRV_Op<"ImageDrefGather", [Pure]> {
OpTypeImage must have a Dim of 2D, Cube, or Rect. The MS operand of the
underlying OpTypeImage must be 0.
- Coordinate must be a scalar or vector of floating-point type. It
- contains (u[, v] … [, array layer]) as needed by the definition of
+ Coordinate must be a scalar or vector of floating-point type. It
+ contains (u[, v] ... [, array layer]) as needed by the definition of
Sampled Image.
Dref is the depth-comparison reference value. It must be a 32-bit
@@ -44,8 +88,8 @@ def SPIRV_ImageDrefGatherOp : SPIRV_Op<"ImageDrefGather", [Pure]> {
#### Example:
```mlir
- %0 = spirv.ImageDrefGather %1 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %2 : vector<4xf32>, %3 : f32 -> vector<4xi32>
- %0 = spirv.ImageDrefGather %1 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %2 : vector<4xf32>, %3 : f32 ["NonPrivateTexel"] : f32, f32 -> vector<4xi32>
+ %0 = spirv.ImageDrefGather %1, %2, %3 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
+ %0 = spirv.ImageDrefGather %1, %2, %3 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 ["NonPrivateTexel"] -> vector<4xi32>
```
}];
@@ -57,23 +101,24 @@ def SPIRV_ImageDrefGatherOp : SPIRV_Op<"ImageDrefGather", [Pure]> {
];
let arguments = (ins
- SPIRV_AnySampledImage:$sampledimage,
+ SPIRV_AnySampledImage:$sampled_image,
SPIRV_ScalarOrVectorOf<SPIRV_Float>:$coordinate,
- SPIRV_Float:$dref,
- OptionalAttr<SPIRV_ImageOperandsAttr>:$imageoperands,
+ SPIRV_Float32:$dref,
+ OptionalAttr<SPIRV_ImageOperandsAttr>:$image_operands,
Variadic<SPIRV_Type>:$operand_arguments
);
let results = (outs
- SPIRV_Vector:$result
+ AnyTypeOf<[SPIRV_Vec4<SPIRV_Integer>, SPIRV_Vec4<SPIRV_Float>]>:$result
);
- let assemblyFormat = [{$sampledimage `:` type($sampledimage) `,`
- $coordinate `:` type($coordinate) `,` $dref `:` type($dref)
- custom<ImageOperands>($imageoperands)
- ( `(` $operand_arguments^ `:` type($operand_arguments) `)`)?
- attr-dict
- `->` type($result)}];
+
+ let assemblyFormat = [{
+ $sampled_image `,` $coordinate `,` $dref custom<ImageOperands>($image_operands) ( `(` $operand_arguments^ `)` )? attr-dict
+ `:` type($sampled_image) `,` type($coordinate) `,` type($dref) ( `(` type($operand_arguments)^ `)` )?
+ `->` type($result)
+ }];
+
}
// -----
@@ -82,7 +127,7 @@ def SPIRV_ImageQuerySizeOp : SPIRV_Op<"ImageQuerySize", [Pure]> {
let summary = "Query the dimensions of Image, with no level of detail.";
let description = [{
- Result Type must be an integer type scalar or vector. The number of
+ Result Type must be an integer type scalar or vector. The number of
components must be:
1 for the 1D and Buffer dimensionalities,
@@ -130,12 +175,15 @@ def SPIRV_ImageQuerySizeOp : SPIRV_Op<"ImageQuerySize", [Pure]> {
SPIRV_ScalarOrVectorOf<SPIRV_Integer>:$result
);
- let assemblyFormat = "attr-dict $image `:` type($image) `->` type($result)";
+ let assemblyFormat = "$image attr-dict `:` type($image) `->` type($result)";
}
// -----
-def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite", []> {
+def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite",
+ [SPIRV_SampledOperandIs<"image", ["SamplerUnknown", "NoSampler"]>,
+ SPIRV_DimIsNot<"image", ["SubpassData"]>,
+ SPIRV_NoneOrElementMatchImage<"texel", "image">]> {
let summary = "Write a texel to an image without a sampler.";
let description = [{
@@ -163,7 +211,7 @@ def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite", []> {
#### Example:
```mlir
- spirv.ImageWrite %0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %1 : vector<2xsi32>, %2 : vector<4xf32>
+ spirv.ImageWrite %0, %1, %2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, vector<4xf32>
```
}];
@@ -177,20 +225,18 @@ def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite", []> {
let results = (outs);
- let assemblyFormat = [{$image `:` type($image) `,`
- $coordinate `:` type($coordinate) `,`
- $texel `:` type($texel)
- custom<ImageOperands>($image_operands)
- ( `(` $operand_arguments^ `:` type($operand_arguments) `)`)?
- attr-dict}];
+ let assemblyFormat = [{
+ $image `,` $coordinate `,` $texel custom<ImageOperands>($image_operands) ( `(` $operand_arguments^ `)`)? attr-dict
+ `:` type($image) `,` type($coordinate) `,` type($texel) ( `(` type($operand_arguments)^ `)`)?
+ }];
}
// -----
def SPIRV_ImageOp : SPIRV_Op<"Image",
[Pure,
- TypesMatchWith<"type of 'result' matches image type of 'sampledimage'",
- "sampledimage", "result",
+ TypesMatchWith<"type of 'result' matches image type of 'sampled_image'",
+ "sampled_image", "result",
"::llvm::cast<spirv::SampledImageType>($_self).getImageType()">]> {
let summary = "Extract the image from a sampled image.";
@@ -210,14 +256,14 @@ def SPIRV_ImageOp : SPIRV_Op<"Image",
}];
let arguments = (ins
- SPIRV_AnySampledImage:$sampledimage
+ SPIRV_AnySampledImage:$sampled_image
);
let results = (outs
SPIRV_AnyImage:$result
);
- let assemblyFormat = "attr-dict $sampledimage `:` type($sampledimage)";
+ let assemblyFormat = "$sampled_image attr-dict `:` type($sampled_image)";
let hasVerifier = 0;
}
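Editor's note: to make the new TableGen predicates concrete, `SPIRV_ValuesAreContained` expands to an `llvm::is_contained` check over the queried field of the underlying image type. Roughly (illustrative C++, not the literal generated verifier):

```c++
#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/STLExtras.h"

// Illustrative only: approximately what
// SPIRV_DimIs<"sampled_image", ["Dim2D", "Cube", "Rect"], ...> verifies.
static bool hasGatherableDim(mlir::Value sampledImage) {
  auto imageTy = llvm::cast<mlir::spirv::ImageType>(
      llvm::cast<mlir::spirv::SampledImageType>(sampledImage.getType())
          .getImageType());
  return llvm::is_contained({mlir::spirv::Dim::Dim2D, mlir::spirv::Dim::Cube,
                             mlir::spirv::Dim::Rect},
                            imageTy.getDim());
}
```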
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 455d07f..2369247 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -237,23 +237,24 @@ class Tosa_I32EnumAttr<string name, string description, string mnemonic,
// VARIABLE : Stateful variable operations.
//===----------------------------------------------------------------------===//
+def Tosa_NONE : I32EnumAttrCase<"none", 0>;
def Tosa_PRO_INT : I32EnumAttrCase<"pro_int", 1>;
def Tosa_PRO_FP : I32EnumAttrCase<"pro_fp", 2>;
-def Tosa_NONE : I32EnumAttrCase<"none", 3>;
-def Tosa_EXT_INT16 : I32EnumAttrCase<"int16", 1>;
-def Tosa_EXT_INT4 : I32EnumAttrCase<"int4", 2>;
-def Tosa_EXT_BF16 : I32EnumAttrCase<"bf16", 3>;
-def Tosa_EXT_FP8E4M3 : I32EnumAttrCase<"fp8e4m3", 4>;
-def Tosa_EXT_FP8E5M2 : I32EnumAttrCase<"fp8e5m2", 5>;
-def Tosa_EXT_FFT : I32EnumAttrCase<"fft", 6>;
-def Tosa_EXT_VARIABLE : I32EnumAttrCase<"variable", 7>;
-def Tosa_EXT_NONE : I32EnumAttrCase<"none", 8>;
+def Tosa_EXT_NONE : I32EnumAttrCase<"none", 0>;
+def Tosa_EXT_INT16 : I32EnumAttrCase<"int16", 1>;
+def Tosa_EXT_INT4 : I32EnumAttrCase<"int4", 2>;
+def Tosa_EXT_BF16 : I32EnumAttrCase<"bf16", 3>;
+def Tosa_EXT_FP8E4M3 : I32EnumAttrCase<"fp8e4m3", 4>;
+def Tosa_EXT_FP8E5M2 : I32EnumAttrCase<"fp8e5m2", 5>;
+def Tosa_EXT_FFT : I32EnumAttrCase<"fft", 6>;
+def Tosa_EXT_VARIABLE : I32EnumAttrCase<"variable", 7>;
+def Tosa_EXT_CONTROLFLOW : I32EnumAttrCase<"controlflow", 8>;
def Tosa_ExtensionAttr
: Tosa_I32EnumAttr<"Extension", "supported TOSA extensions", "ext", [
Tosa_EXT_INT16, Tosa_EXT_INT4, Tosa_EXT_BF16, Tosa_EXT_FP8E4M3,
- Tosa_EXT_FP8E5M2, Tosa_EXT_FFT, Tosa_EXT_VARIABLE, Tosa_EXT_NONE
+ Tosa_EXT_FP8E5M2, Tosa_EXT_FFT, Tosa_EXT_VARIABLE, Tosa_EXT_CONTROLFLOW, Tosa_EXT_NONE
]>;
def Tosa_ExtensionArrayAttr
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 8f822f9..a7373d8 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -2023,7 +2023,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
let arguments = (ins
Tosa_Tensor:$input1,
- Tosa_Int32Tensor:$perms
+ DenseI32ArrayAttr:$perms
);
let results = (
@@ -2035,10 +2035,6 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
Extension<[Tosa_EXT_FP8E4M3, Tosa_EXT_FP8E5M2, Tosa_EXT_BF16]>,
];
- let extraClassDeclaration = [{
- LogicalResult getConstantPerms(llvm::SmallVector<int32_t> &perms);
- }];
-
let hasCanonicalizer = 1;
let hasFolder = 1;
let hasVerifier = 1;
@@ -2436,8 +2432,8 @@ def Tosa_IfOp : Tosa_Op<"cond_if",
);
list<Availability> availability = [
- Profile<[Tosa_PRO_INT, Tosa_PRO_FP]>,
- Extension<[]>,
+ Profile<[]>,
+ Extension<[Tosa_EXT_CONTROLFLOW]>,
];
let regions = (region
@@ -2477,8 +2473,8 @@ def Tosa_WhileOp : Tosa_Op<"while_loop", [
);
list<Availability> availability = [
- Profile<[Tosa_PRO_INT, Tosa_PRO_FP]>,
- Extension<[]>,
+ Profile<[]>,
+ Extension<[Tosa_EXT_CONTROLFLOW]>,
];
let regions = (region
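Editor's note: with `perms` now carried as a `DenseI32ArrayAttr`, lowering code builds a transpose from a plain attribute instead of materializing a constant tensor, as the TosaToLinalgNamed change further below shows. A short sketch, with names assumed:

```c++
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/PatternMatch.h"

// Sketch (names are assumptions): building tosa.transpose after the switch
// from a tensor operand to DenseI32ArrayAttr for `perms`.
static mlir::Value makeTranspose(mlir::PatternRewriter &rewriter,
                                 mlir::Location loc, mlir::Value input,
                                 mlir::RankedTensorType newTy) {
  auto permsAttr = rewriter.getDenseI32ArrayAttr({0, 2, 1});
  return rewriter.create<mlir::tosa::TransposeOp>(loc, newTy, input,
                                                  permsAttr);
}
```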
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
index 5de2a59..064264e 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
@@ -143,6 +143,7 @@ public:
case Extension::fft:
return {Profile::pro_fp};
case Extension::variable:
+ case Extension::controlflow:
return {Profile::pro_fp, Profile::pro_int};
case Extension::none:
return {};
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index d4035d1..fbe0ed2 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -997,13 +997,25 @@ public:
if (!properties) {
T *p = new T{};
properties = p;
+#if defined(__clang__)
+#if __has_warning("-Wdangling-assignment-gsl")
+#pragma clang diagnostic push
+// https://github.com/llvm/llvm-project/issues/126600
+#pragma clang diagnostic ignored "-Wdangling-assignment-gsl"
+#endif
+#endif
propertiesDeleter = [](OpaqueProperties prop) {
delete prop.as<const T *>();
};
- propertiesSetter = [](OpaqueProperties new_prop,
+ propertiesSetter = [](OpaqueProperties newProp,
const OpaqueProperties prop) {
- *new_prop.as<T *>() = *prop.as<const T *>();
+ *newProp.as<T *>() = *prop.as<const T *>();
};
+#if defined(__clang__)
+#if __has_warning("-Wdangling-assignment-gsl")
+#pragma clang diagnostic pop
+#endif
+#endif
propertiesId = TypeID::get<T>();
}
assert(propertiesId == TypeID::get<T>() && "Inconsistent properties");
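Editor's note: the nesting of the preprocessor guards above is deliberate. `__has_warning` is a Clang extension, so it may only be evaluated once `__clang__` is known to be defined; a single combined `#if defined(__clang__) && __has_warning(...)` would be ill-formed on other compilers. The reusable pattern, in isolation:

```c++
// Portable warning-suppression pattern (warning name taken from the change
// above). The outer #if must stay separate so that non-Clang preprocessors
// never see the __has_warning extension.
#if defined(__clang__)
#if __has_warning("-Wdangling-assignment-gsl")
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdangling-assignment-gsl"
#endif
#endif

void codeThatTripsTheFalsePositive() {}

#if defined(__clang__)
#if __has_warning("-Wdangling-assignment-gsl")
#pragma clang diagnostic pop
#endif
#endif
```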
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
index 0d09b92..3e4733d 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
@@ -598,7 +598,11 @@ def DataLayoutTypeInterface : TypeInterface<"DataLayoutTypeInterface"> {
/*retTy=*/"uint64_t",
/*methodName=*/"getPreferredAlignment",
/*args=*/(ins "const ::mlir::DataLayout &":$dataLayout,
- "::mlir::DataLayoutEntryListRef":$params)
+ "::mlir::DataLayoutEntryListRef":$params),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return $_type.getABIAlignment(dataLayout, params);
+ }]
>,
InterfaceMethod<
/*description=*/"Returns the bitwidth that should be used when "
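Editor's note: with this default, a type that only implements `getABIAlignment` now reports the same value as its preferred alignment, and the `getPreferredAlignment` declarations added to the LLVM and Ptr dialect types above become overrides of this fallback. A small usage sketch (the helper name is ours):

```c++
#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include <cstdint>

// Sketch: querying preferred alignment through the interface. Types without
// an explicit override now fall back to their ABI alignment.
static uint64_t preferredAlign(const mlir::DataLayout &layout, mlir::Type ty) {
  return layout.getTypePreferredAlignment(ty);
}
```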
diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.td b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
index c6bffe3..6c95b48 100644
--- a/mlir/include/mlir/Interfaces/LoopLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.td
@@ -180,12 +180,12 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
For loop operations that don't yield a value, this should return
std::nullopt.
}],
- /*retTy=*/"std::optional<::llvm::MutableArrayRef<::mlir::OpOperand>>",
+ /*retTy=*/"::std::optional<::llvm::MutableArrayRef<::mlir::OpOperand>>",
/*methodName=*/"getYieldedValuesMutable",
/*args=*/(ins),
/*methodBody=*/"",
/*defaultImplementation=*/[{
- return std::nullopt;
+ return ::std::nullopt;
}]
>,
InterfaceMethod<[{
@@ -239,7 +239,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
/// Returns if a block is inside a loop (within the current function). This
/// can either be because the block is nested inside a LoopLikeInterface, or
/// because the control flow graph is cyclic
- static bool blockIsInLoop(Block *block);
+ static bool blockIsInLoop(::mlir::Block *block);
}];
let extraSharedClassDeclaration = [{
@@ -249,7 +249,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
auto inductionVars = $_op.getLoopInductionVars();
if (inductionVars.has_value() && (*inductionVars).size() == 1)
return (*inductionVars)[0];
- return std::nullopt;
+ return ::std::nullopt;
}
/// Return the single lower bound value or attribute if it exists, otherwise
/// return std::nullopt.
@@ -257,7 +257,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
auto lowerBounds = $_op.getLoopLowerBounds();
if (lowerBounds.has_value() && (*lowerBounds).size() == 1)
return (*lowerBounds)[0];
- return std::nullopt;
+ return ::std::nullopt;
}
/// Return the single step value or attribute if it exists, otherwise
/// return std::nullopt.
@@ -265,7 +265,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
auto steps = $_op.getLoopSteps();
if (steps.has_value() && (*steps).size() == 1)
return (*steps)[0];
- return std::nullopt;
+ return ::std::nullopt;
}
/// Return the single upper bound value or attribute if it exists, otherwise
/// return std::nullopt.
@@ -273,7 +273,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
auto upperBounds = $_op.getLoopUpperBounds();
if (upperBounds.has_value() && (*upperBounds).size() == 1)
return (*upperBounds)[0];
- return std::nullopt;
+ return ::std::nullopt;
}
/// Append the specified additional "init" operands: replace this loop with
@@ -287,8 +287,9 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
bool replaceInitOperandUsesInLoop) {
return $_op.replaceWithAdditionalYields(
rewriter, newInitOperands, replaceInitOperandUsesInLoop,
- [](OpBuilder &b, Location loc, ArrayRef<BlockArgument> newBBArgs) {
- return SmallVector<Value>(newBBArgs);
+ [](::mlir::OpBuilder &b, ::mlir::Location loc,
+ ::mlir::ArrayRef<::mlir::BlockArgument> newBBArgs) {
+ return ::mlir::SmallVector<::mlir::Value>(newBBArgs);
});
}
@@ -298,9 +299,9 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
auto mutableValues = $_op.getYieldedValuesMutable();
if (!mutableValues || mutableValues->empty())
return {};
- Operation *yieldOp = mutableValues->begin()->getOwner();
+ ::mlir::Operation *yieldOp = mutableValues->begin()->getOwner();
unsigned firstOperandIndex = mutableValues->begin()->getOperandNumber();
- return OperandRange(
+ return ::mlir::OperandRange(
yieldOp->operand_begin() + firstOperandIndex,
yieldOp->operand_begin() + firstOperandIndex + mutableValues->size());
}
@@ -312,7 +313,7 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
if (initsMutable.empty())
return ::mlir::OperandRange($_op->operand_end(), $_op->operand_end());
unsigned firstOperandIndex = initsMutable.begin()->getOperandNumber();
- return OperandRange(
+ return ::mlir::OperandRange(
$_op->operand_begin() + firstOperandIndex,
$_op->operand_begin() + firstOperandIndex + initsMutable.size());
}
@@ -320,99 +321,103 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> {
/// Return the region iter_arg that corresponds to the given init operand.
/// Return an "empty" block argument if the given operand is not an init
/// operand of this loop op.
- BlockArgument getTiedLoopRegionIterArg(OpOperand *opOperand) {
+ ::mlir::BlockArgument getTiedLoopRegionIterArg(
+ ::mlir::OpOperand *opOperand) {
auto initsMutable = $_op.getInitsMutable();
- auto it = llvm::find(initsMutable, *opOperand);
+ auto it = ::llvm::find(initsMutable, *opOperand);
if (it == initsMutable.end())
return {};
- return $_op.getRegionIterArgs()[std::distance(initsMutable.begin(), it)];
+ return $_op.getRegionIterArgs()[
+ ::std::distance(initsMutable.begin(), it)];
}
/// Return the region iter_arg that corresponds to the given loop result.
/// Return an "empty" block argument if the given OpResult is not a loop
/// result or if this op does not expose any loop results.
- BlockArgument getTiedLoopRegionIterArg(OpResult opResult) {
+ ::mlir::BlockArgument getTiedLoopRegionIterArg(::mlir::OpResult opResult) {
auto loopResults = $_op.getLoopResults();
if (!loopResults)
return {};
- auto it = llvm::find(*loopResults, opResult);
+ auto it = ::llvm::find(*loopResults, opResult);
if (it == loopResults->end())
return {};
- return $_op.getRegionIterArgs()[std::distance(loopResults->begin(), it)];
+ return $_op.getRegionIterArgs()[
+ ::std::distance(loopResults->begin(), it)];
}
/// Return the init operand that corresponds to the given region iter_arg.
/// Return "nullptr" if the given block argument is not a region iter_arg
/// of this loop op.
- OpOperand *getTiedLoopInit(BlockArgument bbArg) {
+ ::mlir::OpOperand *getTiedLoopInit(::mlir::BlockArgument bbArg) {
auto iterArgs = $_op.getRegionIterArgs();
- auto it = llvm::find(iterArgs, bbArg);
+ auto it = ::llvm::find(iterArgs, bbArg);
if (it == iterArgs.end())
return {};
- return &$_op.getInitsMutable()[std::distance(iterArgs.begin(), it)];
+ return &$_op.getInitsMutable()[::std::distance(iterArgs.begin(), it)];
}
/// Return the init operand that corresponds to the given loop result.
/// Return "nullptr" if the given OpResult is not a loop result or if this
/// op does not expose any loop results.
- OpOperand *getTiedLoopInit(OpResult opResult) {
+ ::mlir::OpOperand *getTiedLoopInit(::mlir::OpResult opResult) {
auto loopResults = $_op.getLoopResults();
if (!loopResults)
return nullptr;
- auto it = llvm::find(*loopResults, opResult);
+ auto it = ::llvm::find(*loopResults, opResult);
if (it == loopResults->end())
return nullptr;
- return &$_op.getInitsMutable()[std::distance(loopResults->begin(), it)];
+ return &$_op.getInitsMutable()[::std::distance(
+ loopResults->begin(), it)];
}
/// Return the yielded value that corresponds to the given region iter_arg.
/// Return "nullptr" if the given block argument is not a region iter_arg
/// of this loop op or if there is no yield corresponding to this `bbArg`.
- OpOperand *getTiedLoopYieldedValue(BlockArgument bbArg) {
+ ::mlir::OpOperand *getTiedLoopYieldedValue(::mlir::BlockArgument bbArg) {
auto iterArgs = $_op.getRegionIterArgs();
- auto it = llvm::find(iterArgs, bbArg);
+ auto it = ::llvm::find(iterArgs, bbArg);
if (it == iterArgs.end())
return {};
- std::optional<llvm::MutableArrayRef<::mlir::OpOperand>> yieldValues =
+ ::std::optional<::llvm::MutableArrayRef<::mlir::OpOperand>> yieldValues =
$_op.getYieldedValuesMutable();
if (!yieldValues)
return {};
- return &yieldValues.value()[std::distance(iterArgs.begin(), it)];
+ return &yieldValues.value()[::std::distance(iterArgs.begin(), it)];
}
/// Return the loop result that corresponds to the given init operand.
/// Return an "empty" OpResult if the given operand is not an init operand
/// of this loop op or if this op does not expose any loop results.
- OpResult getTiedLoopResult(OpOperand *opOperand) {
+ ::mlir::OpResult getTiedLoopResult(::mlir::OpOperand *opOperand) {
auto loopResults = $_op.getLoopResults();
if (!loopResults)
return {};
auto initsMutable = $_op.getInitsMutable();
- auto it = llvm::find(initsMutable, *opOperand);
+ auto it = ::llvm::find(initsMutable, *opOperand);
if (it == initsMutable.end())
return {};
- return (*loopResults)[std::distance(initsMutable.begin(), it)];
+ return (*loopResults)[::std::distance(initsMutable.begin(), it)];
}
/// Return the loop result that corresponds to the given region iter_arg.
/// Return an "empty" OpResult if the given block argument is not a region
/// iter_arg of this loop op or if this op does not expose any loop results.
- OpResult getTiedLoopResult(BlockArgument bbArg) {
+ ::mlir::OpResult getTiedLoopResult(::mlir::BlockArgument bbArg) {
auto loopResults = $_op.getLoopResults();
if (!loopResults)
return {};
auto iterArgs = $_op.getRegionIterArgs();
- auto it = llvm::find(iterArgs, bbArg);
+ auto it = ::llvm::find(iterArgs, bbArg);
if (it == iterArgs.end())
return {};
- return (*loopResults)[std::distance(iterArgs.begin(), it)];
+ return (*loopResults)[::std::distance(iterArgs.begin(), it)];
}
}];
let verifyWithRegions = 1;
let verify = [{
- return detail::verifyLoopLikeOpInterface($_op);
+ return ::mlir::detail::verifyLoopLikeOpInterface($_op);
}];
}
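Editor's note: the blanket `::std::`/`::mlir::` qualification matters because these method bodies are pasted verbatim into generated classes, which may live inside namespaces where unqualified names resolve differently. A minimal sketch of the failure mode this prevents:

```c++
#include <optional>

// Sketch of why generated bodies must use ::std:: -- if pasted code lands in
// a namespace that declares its own `std`, unqualified lookup finds the
// inner namespace first and `std::nullopt` no longer resolves.
namespace mydialect {
namespace std {} // hypothetical nested namespace shadowing ::std

inline ::std::optional<int> pasted() {
  // return std::nullopt;  // error: `mydialect::std` has no `nullopt`
  return ::std::nullopt;   // fully qualified: always correct
}
} // namespace mydialect
```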
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 7b70b3a..607667f 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -32,10 +32,47 @@
#include "llvm/ADT/Sequence.h"
#include <numeric>
+#include <type_traits>
using namespace mlir;
using namespace mlir::tosa;
+// Helper function to materialize the semantically correct compare and select
+// operations given a binary operation with a specific NaN propagation mode.
+//
+// In the case of "PROPAGATE" semantics no compare and selection is required and
+// this function does nothing.
+//
+// In the case of "IGNORE" semantics this function materializes a comparison of
+// the current operands to the op which will return true for any NaN
+// argument and then selects between the non-NaN operation argument and the
+// calculated result based on whether the lhs or rhs is NaN or not. In pseudo
+// code:
+//
+// binary<op>(lhs, rhs):
+// result = op(lhs, rhs)
+// if lhs == NaN return rhs
+// if rhs == NaN return lhs
+// return result
+template <typename OpTy>
+static Value
+materializeBinaryNanCheckIfRequired(OpTy op, PatternRewriter &rewriter,
+ Value lhs, Value rhs, Value result) {
+ auto nanMode = op.getNanMode();
+ if (nanMode == "PROPAGATE")
+ return result;
+
+ // Unordered comparison of NaN against itself will always return true.
+ Value lhsIsNaN = rewriter.create<arith::CmpFOp>(
+ op.getLoc(), arith::CmpFPredicate::UNO, lhs, lhs);
+ Value rhsIsNaN = rewriter.create<arith::CmpFOp>(
+ op.getLoc(), arith::CmpFPredicate::UNO, rhs, rhs);
+ Value rhsOrResult =
+ rewriter.create<arith::SelectOp>(op.getLoc(), lhsIsNaN, rhs, result);
+ return rewriter.create<arith::SelectOp>(op.getLoc(), rhsIsNaN, lhs,
+ rhsOrResult);
+}
+
template <typename T>
static arith::ConstantOp
createConstFromIntAttribute(Operation *op, const std::string &attrName,
@@ -367,7 +404,9 @@ static Value createLinalgBodyCalculationForElementwiseOp(
// tosa::MaximumOp
if (isa<tosa::MaximumOp>(op) && isa<FloatType>(elementTy)) {
- return rewriter.create<arith::MaximumFOp>(loc, args[0], args[1]);
+ auto max = rewriter.create<arith::MaximumFOp>(loc, args[0], args[1]);
+ return materializeBinaryNanCheckIfRequired(llvm::cast<tosa::MaximumOp>(op),
+ rewriter, args[0], args[1], max);
}
if (isa<tosa::MaximumOp>(op) && elementTy.isSignlessInteger()) {
@@ -376,7 +415,9 @@ static Value createLinalgBodyCalculationForElementwiseOp(
// tosa::MinimumOp
if (isa<tosa::MinimumOp>(op) && isa<FloatType>(elementTy)) {
- return rewriter.create<arith::MinimumFOp>(loc, args[0], args[1]);
+ auto min = rewriter.create<arith::MinimumFOp>(loc, args[0], args[1]);
+ return materializeBinaryNanCheckIfRequired(llvm::cast<tosa::MinimumOp>(op),
+ rewriter, args[0], args[1], min);
}
if (isa<tosa::MinimumOp>(op) && elementTy.isSignlessInteger()) {
@@ -404,7 +445,31 @@ static Value createLinalgBodyCalculationForElementwiseOp(
loc, elementTy, rewriter.getFloatAttr(elementTy, minApf));
auto max = rewriter.create<arith::ConstantOp>(
loc, elementTy, rewriter.getFloatAttr(elementTy, maxApf));
- return clampFloatHelper(loc, args[0], min, max, rewriter);
+ auto result = clampFloatHelper(loc, args[0], min, max, rewriter);
+
+ auto clampOp = llvm::cast<tosa::ClampOp>(op);
+ const auto nanMode = clampOp.getNanMode();
+ // In the case of "PROPAGATE" semantics no compare and select is
+ // required.
+ if (nanMode == "PROPAGATE")
+ return result;
+
+ // In the case of "IGNORE" semantics materialize a comparison
+ // of the current operand to the reduction which will return true for a NaN
+ // argument and then selects between the initial reduction value and the
+ // calculated result based on whether the argument is NaN or not. In pseudo
+ // code:
+ //
+ // reduce<op>(x, init):
+ // result = op(init, x)
+ // return init if x == NaN else result
+
+ // Unordered comparison of NaN against itself will always return true.
+ Value isNaN = rewriter.create<arith::CmpFOp>(
+ op->getLoc(), arith::CmpFPredicate::UNO, args[0], args[0]);
+ // TOSA specifies that in "ignore" NaN mode the result is "min" if the input
+ // is NaN.
+ return rewriter.create<arith::SelectOp>(op->getLoc(), isNaN, min, result);
}
if (isa<tosa::ClampOp>(op) && isa<IntegerType>(elementTy)) {
@@ -1078,7 +1143,8 @@ static Value createLinalgBodyCalculationForReduceOp(Operation *op,
// Performs the match and rewrite for reduction operations. This includes
// declaring a correctly sized initial value, and the linalg.generic operation
// that reduces across the specified axis.
-static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
+template <typename OpTy>
+static LogicalResult reduceMatchAndRewriteHelper(OpTy op, uint64_t axis,
PatternRewriter &rewriter) {
auto loc = op->getLoc();
auto inputTy = cast<ShapedType>(op->getOperand(0).getType());
@@ -1096,6 +1162,9 @@ static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
}
}
+ SmallVector<Value> inputs, outputs;
+ inputs.push_back(input);
+
// First fill the output buffer with the init value.
auto emptyTensor =
rewriter
@@ -1113,26 +1182,127 @@ static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
.create<linalg::FillOp>(loc, ValueRange{fillValue},
ValueRange{emptyTensor})
.result();
+ outputs.push_back(filledTensor);
+
+ bool isNanIgnoreMode = false;
+ if constexpr (std::is_same_v<OpTy, tosa::ReduceMinOp> ||
+ std::is_same_v<OpTy, tosa::ReduceMaxOp>) {
+ if (op.getNanMode() == "IGNORE") {
+ isNanIgnoreMode = true;
+ // Because the TOSA spec requires that the result be NaN iff all
+ // elements in the reduction are NaN, we can't simply perform a compare
+ // and select. We also have to keep track of whether we've seen any
+ // non-NaN values and then do a final select based on this predicate.
+ auto trueAttr = rewriter.getBoolAttr(true);
+ auto trueValue = rewriter.create<arith::ConstantOp>(loc, trueAttr);
+ auto emptyBoolTensor =
+ rewriter
+ .create<tensor::EmptyOp>(loc, reduceShape, trueValue.getType(),
+ dynDims)
+ .getResult();
+ auto allResultsNaNTensor =
+ rewriter
+ .create<linalg::FillOp>(loc, ValueRange{trueValue},
+ ValueRange{emptyBoolTensor})
+ .result();
+ // Note that because linalg::ReduceOp has two variadic arguments
+ // (inputs and outputs) and carries the SameVariadicOperandSize trait,
+ // we need to have the same number of inputs and outputs.
+ //
+ // The second input isn't actually used anywhere since the value used to
+ // update the NaN flag is calculated inside the body of the reduction and
+ // then used to update an out value.
+ // In order to satisfy type constraints we just pass another copy of the
+ // input here.
+ inputs.push_back(input);
+ outputs.push_back(allResultsNaNTensor);
+ }
+ }
bool didEncounterError = false;
- auto linalgOp = rewriter.create<linalg::ReduceOp>(
- loc, input, filledTensor, axis,
+ linalg::LinalgOp linalgOp = rewriter.create<linalg::ReduceOp>(
+ loc, inputs, outputs, axis,
[&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange blockArgs) {
+ std::array<Value, 2> binaryArgs{
+ blockArgs[0], isNanIgnoreMode ? blockArgs[2] : blockArgs[1]};
auto result = createLinalgBodyCalculationForReduceOp(
- op, blockArgs, elementTy, rewriter);
+ op, binaryArgs, elementTy, rewriter);
if (result)
didEncounterError = true;
- nestedBuilder.create<linalg::YieldOp>(loc, result);
+ SmallVector<Value> resultsToYield;
+ if (isNanIgnoreMode) {
+ auto inputValue = blockArgs[0];
+ auto initialValue = blockArgs[2];
+ auto oldAllResultsNanFlagValue = blockArgs[3];
+
+ // Unordered comparison of NaN against itself will always return true.
+ Value isNaN = nestedBuilder.create<arith::CmpFOp>(
+ op->getLoc(), arith::CmpFPredicate::UNO, inputValue, inputValue);
+ // If we've encountered a NaN, take the non-NaN value.
+ auto selectOp = nestedBuilder.create<arith::SelectOp>(
+ op->getLoc(), isNaN, initialValue, result);
+ // Update the flag that tracks whether every value seen so far has
+ // been NaN.
+ auto newAllResultsNanFlagValue = nestedBuilder.create<arith::AndIOp>(
+ op->getLoc(), oldAllResultsNanFlagValue, isNaN);
+ resultsToYield.push_back(selectOp);
+ resultsToYield.push_back(newAllResultsNanFlagValue);
+ } else {
+ resultsToYield.push_back(result);
+ }
+ nestedBuilder.create<linalg::YieldOp>(loc, resultsToYield);
});
if (!didEncounterError)
return rewriter.notifyMatchFailure(
op, "unable to create linalg.generic body for reduce op");
+ if (isNanIgnoreMode) {
+ // Materialize a check to see whether we encountered any non-NaN values.
+ // If we didn't, we need to select a tensor of NaNs since the result will
+ // just be the initial identity value propagated through all the compares
+ // and selects inside the reduction.
+
+ // Create a tensor full of NaNs.
+ auto nanValueAttr = rewriter.getFloatAttr(
+ elementTy,
+ APFloat::getNaN(cast<FloatType>(elementTy).getFloatSemantics(), false));
+ auto nanValue = rewriter.create<arith::ConstantOp>(loc, nanValueAttr);
+ auto emptyNanTensor =
+ rewriter
+ .create<tensor::EmptyOp>(loc, reduceShape,
+ resultTy.getElementType(), dynDims)
+ .getResult();
+ auto nanFilledTensor =
+ rewriter
+ .create<linalg::FillOp>(loc, ValueRange{nanValue},
+ ValueRange{emptyNanTensor})
+ .result();
+
+ // Create an empty tensor; no need to fill it since it will be
+ // overwritten by the select.
+ auto finalEmptyTensor =
+ rewriter
+ .create<tensor::EmptyOp>(loc, reduceShape,
+ resultTy.getElementType(), dynDims)
+ .getResult();
+
+ // Do a selection between the tensors akin to:
+ // result = NaN if "all results NaN" else result.
+ SmallVector<Value> ins, outs;
+ ins.push_back(linalgOp->getOpResult(1));
+ ins.push_back(nanFilledTensor);
+ ins.push_back(linalgOp->getResult(0));
+ outs.push_back(finalEmptyTensor);
+ auto linalgSelect =
+ rewriter.create<linalg::SelectOp>(op->getLoc(), ins, outs);
+ linalgOp = linalgSelect;
+ }
+
SmallVector<ReassociationExprs, 4> reassociationMap;
uint64_t expandInputRank =
- cast<ShapedType>(linalgOp.getResults()[0].getType()).getRank();
+ cast<ShapedType>(linalgOp->getResults()[0].getType()).getRank();
reassociationMap.resize(expandInputRank);
for (uint64_t i = 0; i < expandInputRank; i++) {
@@ -1151,7 +1321,7 @@ static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
// not have access to such information. This matters when handling dynamically
// sized tensors.
rewriter.replaceOpWithNewOp<tensor::ExpandShapeOp>(
- op, resultTy, linalgOp.getResults()[0], reassociationMap);
+ op, resultTy, linalgOp->getResults()[0], reassociationMap);
return success();
}
@@ -2097,6 +2267,27 @@ public:
nestedLoc, predicate, newValue, oldValue);
auto resultIndex = rewriter.create<arith::SelectOp>(
nestedLoc, predicate, newIndex, oldIndex);
+
+ // Check if we need to materialize compare and select for the given
+ // NaN propagation mode.
+
+ // "PROPAGATE" matches the default NaN propagation mode of the arith
+ // dialect so no compare and select is required.
+ //
+ // In the case "IGNORE" we check if the current argument is NaN and
+ // select the old index and value otherwise take the updated index and
+ // value.
+ if (const auto nanMode = argmaxOp.getNanMode(); nanMode == "IGNORE") {
+ // An unordered comparison of a value with itself returns true only
+ // when that value is NaN.
+ Value isNaN = rewriter.create<arith::CmpFOp>(
+ argmaxOp.getLoc(), arith::CmpFPredicate::UNO, newValue,
+ newValue);
+ resultMax = rewriter.create<arith::SelectOp>(nestedLoc, isNaN,
+ oldValue, resultMax);
+ resultIndex = rewriter.create<arith::SelectOp>(
+ nestedLoc, isNaN, oldIndex, resultIndex);
+ }
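Under "IGNORE" the reduction body thus effectively computes, per element
(names invented for illustration):

    %is_nan     = arith.cmpf uno, %new_value, %new_value : f32
    %result_max = arith.select %is_nan, %old_value, %max_candidate : f32
    %result_idx = arith.select %is_nan, %old_index, %idx_candidate : i32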
nestedBuilder.create<linalg::YieldOp>(
nestedLoc, ValueRange({resultIndex, resultMax}));
});
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index a8fd536..6353058 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -329,13 +329,11 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
- Value weightPermValue =
- rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
+ auto weightPermAttr = rewriter.getDenseI32ArrayAttr(weightPerm);
Type newWeightTy =
RankedTensorType::get(newWeightShape, weightTy.getElementType());
weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight,
- weightPermValue);
+ weightPermAttr);
}
}
@@ -353,13 +351,11 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
- Value weightPermValue =
- rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
+ auto weightPermAttr = rewriter.getDenseI32ArrayAttr(weightPerm);
Type newWeightTy =
RankedTensorType::get(newWeightShape, weightTy.getElementType());
weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight,
- weightPermValue);
+ weightPermAttr);
}
// Extract the attributes for convolution.
@@ -724,11 +720,44 @@ public:
rewriter.replaceOpWithNewOp<linalg::PoolingNhwcMaxUnsignedOp>(
op, ArrayRef<Type>{resultTy}, ValueRange{paddedInput, fakeWindowDims},
filledEmptyTensor, strideAttr, dilationAttr);
- } else {
- rewriter.replaceOpWithNewOp<linalg::PoolingNhwcMaxOp>(
- op, ArrayRef<Type>{resultTy}, ValueRange{paddedInput, fakeWindowDims},
- filledEmptyTensor, strideAttr, dilationAttr);
+ return llvm::success();
}
+
+ auto resultOp = rewriter.create<linalg::PoolingNhwcMaxOp>(
+ op->getLoc(), ArrayRef<Type>{resultTy},
+ ValueRange{paddedInput, fakeWindowDims}, filledEmptyTensor, strideAttr,
+ dilationAttr);
+
+ rewriter.replaceOp(op, resultOp);
+ // "PROPAGATE" mode matches the behaviour of the LinAlg named op, so no
+ // compare and select materialization is required.
+ //
+ // In the case of "IGNORE" we need to insert a compare and select. Since
+ // we've already produced a named op we will just take its body and modify
+ // it to include the appropriate checks. If the current value is NaN the
+ // old value of pool will be taken otherwise we use the result.
+ if (const auto nanMode = op.getNanMode(); nanMode == "IGNORE") {
+ auto genericOp = rewriter.create<linalg::GenericOp>(
+ op->getLoc(), resultOp.getType(0), resultOp.getInputs(),
+ resultOp.getOutputs(), resultOp.getIndexingMapsArray(),
+ resultOp.getIteratorTypesArray(),
+ [&](OpBuilder &opBuilder, Location loc, ValueRange blockArgs) {
+ IRMapping map;
+ auto oldBlock = resultOp.getRegion().begin();
+ auto oldArgs = oldBlock->getArguments();
+ auto &oldMaxOp = *resultOp.getBlock()->begin();
+ map.map(oldArgs, blockArgs);
+ auto *newOp = opBuilder.clone(oldMaxOp, map);
+ Value isNaN = opBuilder.create<arith::CmpFOp>(
+ op->getLoc(), arith::CmpFPredicate::UNO, blockArgs.front(),
+ blockArgs.front());
+ auto selectOp = opBuilder.create<arith::SelectOp>(
+ op->getLoc(), isNaN, blockArgs.back(), newOp->getResult(0));
+ opBuilder.create<linalg::YieldOp>(loc, selectOp.getResult());
+ });
+ rewriter.replaceOp(resultOp, genericOp);
+ }
+
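A rough sketch of the rewritten generic body, assuming the cloned operation
is a float max (block-argument names invented):

    ^bb0(%in: f32, %acc: f32):
      %max    = arith.maximumf %in, %acc : f32   // cloned from the named op
      %is_nan = arith.cmpf uno, %in, %in : f32
      %sel    = arith.select %is_nan, %acc, %max : f32
      linalg.yield %sel : f32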
return success();
}
};
@@ -970,9 +999,7 @@ public:
LogicalResult matchAndRewrite(tosa::TransposeOp op,
PatternRewriter &rewriter) const final {
- SmallVector<int32_t> constantPerms;
- if (failed(op.getConstantPerms(constantPerms)))
- return failure();
+ const llvm::ArrayRef<int32_t> constantPerms = op.getPerms();
Location loc = op.getLoc();
// The verifier should have made sure we have a valid TOSA permutation
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index b97f11a..1c47936 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -15,7 +15,6 @@
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
@@ -473,7 +472,8 @@ static Value createPrivateMemRef(AffineForOp forOp,
// is lower.
// TODO: Extend profitability analysis to support scenarios with multiple
// stores.
-static bool isFusionProfitable(AffineForOp srcForOp, Operation *srcStoreOpInst,
+static bool isFusionProfitable(AffineForOp srcForOp,
+ ArrayRef<Operation *> producerStores,
AffineForOp dstForOp,
ArrayRef<ComputationSliceState> depthSliceUnions,
unsigned maxLegalFusionDepth,
@@ -503,6 +503,35 @@ static bool isFusionProfitable(AffineForOp srcForOp, Operation *srcStoreOpInst,
if (!getLoopNestStats(dstForOp, &dstLoopNestStats))
return false;
+ // For now we limit profitability analysis to scenarios with a single
+ // producer store. Note that some multi-store producer scenarios will
+ // still go through profitability analysis if only one of the stores is
+ // involved in the producer-consumer relationship of the candidate
+ // loops.
+ // TODO: Support multiple producer stores in profitability analysis.
+ if (producerStores.size() > 1) {
+ LLVM_DEBUG(llvm::dbgs() << "Limited profitability analysis. Not "
+ "supported for the multiple-producer-store case.\n");
+ int64_t sliceCost;
+ int64_t fusedLoopNestComputeCost;
+ // We will still fuse if fusion obeys the specified compute
+ // tolerance at the max legal depth.
+ auto fraction = getAdditionalComputeFraction(
+ srcForOp, dstForOp, maxLegalFusionDepth, depthSliceUnions, sliceCost,
+ fusedLoopNestComputeCost);
+ if (!fraction || fraction > computeToleranceThreshold) {
+ LLVM_DEBUG(llvm::dbgs() << "Additional computation exceeds "
+ "compute tolerance. Not fusing.\n");
+ return false;
+ }
+ LLVM_DEBUG(llvm::dbgs()
+ << "Considering fusion profitable at max legal depth.\n");
+ return true;
+ }
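As a worked example with invented numbers: if the source and destination
nests together cost 100 op instances and the fused nest would cost 130, the
additional compute fraction is 0.3; a computeToleranceThreshold of 0.2
rejects the multi-store fusion, while 0.5 admits it at the max legal depth.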
+
+ Operation *srcStoreOp = producerStores.front();
+
// Search for min cost value for 'dstLoopDepth'. At each value of
// 'dstLoopDepth' from 'maxLegalLoopDepth' to '1', compute computation slice
// bounds between 'srcOpInst' and each op in 'dstOpinsts' (taking the union
@@ -516,12 +545,9 @@ static bool isFusionProfitable(AffineForOp srcForOp, Operation *srcStoreOpInst,
// The best loop depth at which to materialize the slice.
std::optional<unsigned> bestDstLoopDepth;
- // Compute op instance count for the src loop nest without iteration slicing.
- uint64_t srcLoopNestCost = getComputeCost(srcForOp, srcLoopNestStats);
-
// Compute src loop nest write region size.
- MemRefRegion srcWriteRegion(srcStoreOpInst->getLoc());
- if (failed(srcWriteRegion.compute(srcStoreOpInst, /*loopDepth=*/0))) {
+ MemRefRegion srcWriteRegion(srcStoreOp->getLoc());
+ if (failed(srcWriteRegion.compute(srcStoreOp, /*loopDepth=*/0))) {
LLVM_DEBUG(llvm::dbgs()
<< "Unable to compute MemRefRegion for source operation\n");
return false;
@@ -533,7 +559,10 @@ static bool isFusionProfitable(AffineForOp srcForOp, Operation *srcStoreOpInst,
return false;
int64_t srcWriteRegionSizeBytes = *maybeSrcWriteRegionSizeBytes;
- // Compute op instance count for the src loop nest.
+ // Compute op instance count for the src loop nest without iteration slicing.
+ uint64_t srcLoopNestCost = getComputeCost(srcForOp, srcLoopNestStats);
+
+ // Compute op instance count for the destination loop nest.
uint64_t dstLoopNestCost = getComputeCost(dstForOp, dstLoopNestStats);
// Evaluate all depth choices for materializing the slice in the destination
@@ -563,9 +592,8 @@ static bool isFusionProfitable(AffineForOp srcForOp, Operation *srcStoreOpInst,
// Determine what the slice write MemRefRegion would be, if the src loop
// nest slice 'slice' were to be inserted into the dst loop nest at loop
// depth 'i'.
- MemRefRegion sliceWriteRegion(srcStoreOpInst->getLoc());
- if (failed(sliceWriteRegion.compute(srcStoreOpInst, /*loopDepth=*/0,
- &slice))) {
+ MemRefRegion sliceWriteRegion(srcStoreOp->getLoc());
+ if (failed(sliceWriteRegion.compute(srcStoreOp, /*loopDepth=*/0, &slice))) {
LLVM_DEBUG(llvm::dbgs()
<< "Failed to compute slice write region at loopDepth: " << i
<< "\n");
@@ -1025,21 +1053,13 @@ public:
cast<AffineWriteOpInterface>(op).getMemRef()))
producerStores.push_back(op);
- // TODO: Suppport multiple producer stores in profitability
- // analysis. We limit profitability analysis to only scenarios with
- // a single producer store for now. Note that some multi-store
- // producer scenarios will still go through profitability analysis
- // if only one of the stores is involved the producer-consumer
- // relationship of the candidate loops.
assert(!producerStores.empty() && "Expected producer store");
- if (producerStores.size() > 1)
- LLVM_DEBUG(llvm::dbgs() << "Skipping profitability analysis. Not "
- "supported for this case\n");
- else if (!isFusionProfitable(srcAffineForOp, producerStores[0],
- dstAffineForOp, depthSliceUnions,
- maxLegalFusionDepth, &bestDstLoopDepth,
- computeToleranceThresholdToUse))
+ if (!isFusionProfitable(srcAffineForOp, producerStores,
+ dstAffineForOp, depthSliceUnions,
+ maxLegalFusionDepth, &bestDstLoopDepth,
+ computeToleranceThresholdToUse)) {
continue;
+ }
}
assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
index 7784069..edd6bcf 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
@@ -300,7 +300,7 @@ static bool hasTensorSignature(func::FuncOp funcOp) {
/// Store all functions of the `moduleOp` in `orderedFuncOps`, sorted by
/// callee-caller order (i.e., callees without callers first). Store all
/// remaining functions (i.e., the ones that call each other recursively) in
-/// `remainingFuncOps`.
+/// `remainingFuncOps`. Does not traverse nested symbol tables.
///
/// Store the map of FuncOp to all its callers in `callerMap`.
///
@@ -314,10 +314,10 @@ static LogicalResult getFuncOpsOrderedByCalls(
DenseMap<func::FuncOp, DenseSet<func::FuncOp>> calledBy;
// For each FuncOp, the number of func::CallOp it contains.
DenseMap<func::FuncOp, unsigned> numberCallOpsContainedInFuncOp;
- WalkResult res = moduleOp.walk([&](func::FuncOp funcOp) -> WalkResult {
+ for (func::FuncOp funcOp : moduleOp.getOps<func::FuncOp>()) {
// Collect function calls and populate the caller map.
numberCallOpsContainedInFuncOp[funcOp] = 0;
- return funcOp.walk([&](func::CallOp callOp) -> WalkResult {
+ WalkResult res = funcOp.walk([&](func::CallOp callOp) -> WalkResult {
func::FuncOp calledFunction = getCalledFunction(callOp);
assert(calledFunction && "could not retrieved called func::FuncOp");
// If the called function does not have any tensors in its signature, then
@@ -331,9 +331,9 @@ static LogicalResult getFuncOpsOrderedByCalls(
}
return WalkResult::advance();
});
- });
- if (res.wasInterrupted())
- return failure();
+ if (res.wasInterrupted())
+ return failure();
+ }
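With the walk replaced by getOps, functions inside a nested symbol table are
no longer collected; schematically (names invented, bodies abbreviated):

    module {
      func.func @outer() { ... }   // visited
      module @inner {              // separate symbol table: not traversed
        func.func @skipped() { ... }
      }
    }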
// Iteratively remove function operations that do not call any of the
// functions remaining in the callCounter map and add them to ordered list.
@@ -498,10 +498,10 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp,
void mlir::bufferization::removeBufferizationAttributesInModule(
ModuleOp moduleOp) {
- moduleOp.walk([&](func::FuncOp op) {
+ for (auto op : moduleOp.getOps<func::FuncOp>()) {
for (BlockArgument bbArg : op.getArguments())
removeBufferizationAttributes(bbArg);
- });
+ }
}
LogicalResult mlir::bufferization::bufferizeModuleOp(
@@ -557,7 +557,7 @@ LogicalResult mlir::bufferization::bufferizeModuleOp(
// Bufferize all other ops.
for (Operation &op : llvm::make_early_inc_range(moduleOp.getOps())) {
// Functions were already bufferized.
- if (isa<func::FuncOp>(&op))
+ if (isa<func::FuncOp>(&op) || op.hasTrait<OpTrait::SymbolTable>())
continue;
if (failed(bufferizeOp(&op, options, statistics)))
return failure();
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
index 6db60b7..4326b19 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -52,14 +52,23 @@ mlir::bufferization::insertTensorCopies(Operation *op,
const AnalysisState &state) {
IRRewriter rewriter(op->getContext());
- WalkResult result = op->walk([&](Operation *op) {
- auto bufferizableOp = state.getOptions().dynCastBufferizableOp(op);
+ // It may be more efficient to walk in pre-order here, but the current
+ // implementation visits regions of ops even if they are not allowed or
+ // bufferizable, and existing tests rely on this behavior.
+ // For now, only exclude nested operations if they are in a different symbol
+ // table scope.
+ WalkResult result = op->walk([&](Operation *nestedOp) {
+ if (op->hasTrait<OpTrait::SymbolTable>() &&
+ nestedOp->getParentWithTrait<OpTrait::SymbolTable>() != op)
+ return WalkResult::skip();
+
+ auto bufferizableOp = state.getOptions().dynCastBufferizableOp(nestedOp);
if (!bufferizableOp)
return WalkResult::skip();
// Find inplacability conflicts and resolve them. (Typically with explicit
// tensor copies in the form of AllocTensorOps.)
- rewriter.setInsertionPoint(op);
+ rewriter.setInsertionPoint(nestedOp);
if (failed(bufferizableOp.resolveConflicts(rewriter, state)))
return WalkResult::interrupt();
diff --git a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
index ae8ad5a..235beb0 100644
--- a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_dialect_library(MLIRSPIRVDialect
ControlFlowOps.cpp
CooperativeMatrixOps.cpp
GroupOps.cpp
+ ImageOps.cpp
IntegerDotProductOps.cpp
MemoryOps.cpp
MeshOps.cpp
diff --git a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
new file mode 100644
index 0000000..daf6d84
--- /dev/null
+++ b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
@@ -0,0 +1,138 @@
+//===- ImageOps.cpp - MLIR SPIR-V Image Ops ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the image operations in the SPIR-V dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Common utility functions
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyImageOperands(Operation *imageOp,
+ spirv::ImageOperandsAttr attr,
+ Operation::operand_range operands) {
+ if (!attr) {
+ if (operands.empty())
+ return success();
+
+ return imageOp->emitError("the Image Operands should encode what operands "
+ "follow, as per Image Operands");
+ }
+
+ // TODO: Add the validation rules for the following Image Operands.
+ spirv::ImageOperands noSupportOperands =
+ spirv::ImageOperands::Bias | spirv::ImageOperands::Lod |
+ spirv::ImageOperands::Grad | spirv::ImageOperands::ConstOffset |
+ spirv::ImageOperands::Offset | spirv::ImageOperands::ConstOffsets |
+ spirv::ImageOperands::Sample | spirv::ImageOperands::MinLod |
+ spirv::ImageOperands::MakeTexelAvailable |
+ spirv::ImageOperands::MakeTexelVisible |
+ spirv::ImageOperands::SignExtend | spirv::ImageOperands::ZeroExtend;
+
+ assert(!spirv::bitEnumContainsAny(attr.getValue(), noSupportOperands) &&
+ "unimplemented operands of Image Operands");
+ (void)noSupportOperands;
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.ImageDrefGather
+//===----------------------------------------------------------------------===//
+
+LogicalResult spirv::ImageDrefGatherOp::verify() {
+ return verifyImageOperands(getOperation(), getImageOperandsAttr(),
+ getOperandArguments());
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.ImageWriteOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult spirv::ImageWriteOp::verify() {
+ // TODO: Do we need to check for: "If the Arrayed operand is 1, then
+ // additional capabilities may be required; e.g., ImageCubeArray, or
+ // ImageMSArray."?
+
+ // TODO: Ideally it should be somewhere verified that "The Image Format must
+ // not be Unknown, unless the StorageImageWriteWithoutFormat Capability was
+ // declared." This function however may not be the suitable place for such
+ // verification.
+
+ return verifyImageOperands(getOperation(), getImageOperandsAttr(),
+ getOperandArguments());
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.ImageQuerySize
+//===----------------------------------------------------------------------===//
+
+LogicalResult spirv::ImageQuerySizeOp::verify() {
+ spirv::ImageType imageType =
+ llvm::cast<spirv::ImageType>(getImage().getType());
+ Type resultType = getResult().getType();
+
+ spirv::Dim dim = imageType.getDim();
+ spirv::ImageSamplingInfo samplingInfo = imageType.getSamplingInfo();
+ spirv::ImageSamplerUseInfo samplerInfo = imageType.getSamplerUseInfo();
+ switch (dim) {
+ case spirv::Dim::Dim1D:
+ case spirv::Dim::Dim2D:
+ case spirv::Dim::Dim3D:
+ case spirv::Dim::Cube:
+ if (samplingInfo != spirv::ImageSamplingInfo::MultiSampled &&
+ samplerInfo != spirv::ImageSamplerUseInfo::SamplerUnknown &&
+ samplerInfo != spirv::ImageSamplerUseInfo::NoSampler)
+ return emitError(
+ "if Dim is 1D, 2D, 3D, or Cube, "
+ "it must also have either an MS of 1 or a Sampled of 0 or 2");
+ break;
+ case spirv::Dim::Buffer:
+ case spirv::Dim::Rect:
+ break;
+ default:
+ return emitError("the Dim operand of the image type must "
+ "be 1D, 2D, 3D, Buffer, Cube, or Rect");
+ }
+
+ unsigned componentNumber = 0;
+ switch (dim) {
+ case spirv::Dim::Dim1D:
+ case spirv::Dim::Buffer:
+ componentNumber = 1;
+ break;
+ case spirv::Dim::Dim2D:
+ case spirv::Dim::Cube:
+ case spirv::Dim::Rect:
+ componentNumber = 2;
+ break;
+ case spirv::Dim::Dim3D:
+ componentNumber = 3;
+ break;
+ default:
+ break;
+ }
+
+ if (imageType.getArrayedInfo() == spirv::ImageArrayedInfo::Arrayed)
+ componentNumber += 1;
+
+ unsigned resultComponentNumber = 1;
+ if (auto resultVectorType = llvm::dyn_cast<VectorType>(resultType))
+ resultComponentNumber = resultVectorType.getNumElements();
+
+ if (componentNumber != resultComponentNumber)
+ return emitError("expected the result to have ")
+ << componentNumber << " component(s), but found "
+ << resultComponentNumber << " component(s)";
+
+ return success();
+}
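As a worked example: a Dim2D image contributes 2 components and Arrayed adds
one more, so querying the size of a 2-D arrayed image must yield a
3-component vector; a scalar result is only valid when the computed count is
1, e.g. for a non-arrayed Dim1D or Buffer image.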
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
index dc41433..da9855b 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp
@@ -166,34 +166,6 @@ static void printOneResultOp(Operation *op, OpAsmPrinter &p) {
p << " : " << resultType;
}
-template <typename Op>
-static LogicalResult verifyImageOperands(Op imageOp,
- spirv::ImageOperandsAttr attr,
- Operation::operand_range operands) {
- if (!attr) {
- if (operands.empty())
- return success();
-
- return imageOp.emitError("the Image Operands should encode what operands "
- "follow, as per Image Operands");
- }
-
- // TODO: Add the validation rules for the following Image Operands.
- spirv::ImageOperands noSupportOperands =
- spirv::ImageOperands::Bias | spirv::ImageOperands::Lod |
- spirv::ImageOperands::Grad | spirv::ImageOperands::ConstOffset |
- spirv::ImageOperands::Offset | spirv::ImageOperands::ConstOffsets |
- spirv::ImageOperands::Sample | spirv::ImageOperands::MinLod |
- spirv::ImageOperands::MakeTexelAvailable |
- spirv::ImageOperands::MakeTexelVisible |
- spirv::ImageOperands::SignExtend | spirv::ImageOperands::ZeroExtend;
-
- if (spirv::bitEnumContainsAll(attr.getValue(), noSupportOperands))
- llvm_unreachable("unimplemented operands of Image Operands");
-
- return success();
-}
-
template <typename BlockReadWriteOpTy>
static LogicalResult verifyBlockReadWritePtrAndValTypes(BlockReadWriteOpTy op,
Value ptr, Value val) {
@@ -2003,85 +1975,6 @@ LogicalResult spirv::GLLdexpOp::verify() {
}
//===----------------------------------------------------------------------===//
-// spirv.ImageDrefGather
-//===----------------------------------------------------------------------===//
-
-LogicalResult spirv::ImageDrefGatherOp::verify() {
- VectorType resultType = llvm::cast<VectorType>(getResult().getType());
- auto sampledImageType =
- llvm::cast<spirv::SampledImageType>(getSampledimage().getType());
- auto imageType =
- llvm::cast<spirv::ImageType>(sampledImageType.getImageType());
-
- if (resultType.getNumElements() != 4)
- return emitOpError("result type must be a vector of four components");
-
- Type elementType = resultType.getElementType();
- Type sampledElementType = imageType.getElementType();
- if (!llvm::isa<NoneType>(sampledElementType) &&
- elementType != sampledElementType)
- return emitOpError(
- "the component type of result must be the same as sampled type of the "
- "underlying image type");
-
- spirv::Dim imageDim = imageType.getDim();
- spirv::ImageSamplingInfo imageMS = imageType.getSamplingInfo();
-
- if (imageDim != spirv::Dim::Dim2D && imageDim != spirv::Dim::Cube &&
- imageDim != spirv::Dim::Rect)
- return emitOpError(
- "the Dim operand of the underlying image type must be 2D, Cube, or "
- "Rect");
-
- if (imageMS != spirv::ImageSamplingInfo::SingleSampled)
- return emitOpError("the MS operand of the underlying image type must be 0");
-
- spirv::ImageOperandsAttr attr = getImageoperandsAttr();
- auto operandArguments = getOperandArguments();
-
- return verifyImageOperands(*this, attr, operandArguments);
-}
-
-//===----------------------------------------------------------------------===//
-// spirv.ImageWriteOp
-//===----------------------------------------------------------------------===//
-
-LogicalResult spirv::ImageWriteOp::verify() {
- ImageType imageType = cast<ImageType>(getImage().getType());
- Type sampledType = imageType.getElementType();
- ImageSamplerUseInfo samplerInfo = imageType.getSamplerUseInfo();
-
- if (!llvm::is_contained({spirv::ImageSamplerUseInfo::SamplerUnknown,
- spirv::ImageSamplerUseInfo::NoSampler},
- samplerInfo)) {
- return emitOpError(
- "the sampled operand of the underlying image must be 0 or 2");
- }
-
- // TODO: Do we need check for: "If the Arrayed operand is 1, then additional
- // capabilities may be required; e.g., ImageCubeArray, or ImageMSArray."?
-
- if (imageType.getDim() == spirv::Dim::SubpassData) {
- return emitOpError(
- "the Dim operand of the underlying image must not be SubpassData");
- }
-
- Type texelType = getElementTypeOrSelf(getTexel());
- if (!isa<NoneType>(sampledType) && texelType != sampledType) {
- return emitOpError(
- "the texel component type must match the image sampled type");
- }
-
- // TODO: Ideally it should be somewhere verified that "The Image Format must
- // not be Unknown, unless the StorageImageWriteWithoutFormat Capability was
- // declared." This function however may not be the suitable place for such
- // verification.
-
- return verifyImageOperands(*this, getImageOperandsAttr(),
- getOperandArguments());
-}
-
-//===----------------------------------------------------------------------===//
// spirv.ShiftLeftLogicalOp
//===----------------------------------------------------------------------===//
@@ -2106,71 +1999,6 @@ LogicalResult spirv::ShiftRightLogicalOp::verify() {
}
//===----------------------------------------------------------------------===//
-// spirv.ImageQuerySize
-//===----------------------------------------------------------------------===//
-
-LogicalResult spirv::ImageQuerySizeOp::verify() {
- spirv::ImageType imageType =
- llvm::cast<spirv::ImageType>(getImage().getType());
- Type resultType = getResult().getType();
-
- spirv::Dim dim = imageType.getDim();
- spirv::ImageSamplingInfo samplingInfo = imageType.getSamplingInfo();
- spirv::ImageSamplerUseInfo samplerInfo = imageType.getSamplerUseInfo();
- switch (dim) {
- case spirv::Dim::Dim1D:
- case spirv::Dim::Dim2D:
- case spirv::Dim::Dim3D:
- case spirv::Dim::Cube:
- if (samplingInfo != spirv::ImageSamplingInfo::MultiSampled &&
- samplerInfo != spirv::ImageSamplerUseInfo::SamplerUnknown &&
- samplerInfo != spirv::ImageSamplerUseInfo::NoSampler)
- return emitError(
- "if Dim is 1D, 2D, 3D, or Cube, "
- "it must also have either an MS of 1 or a Sampled of 0 or 2");
- break;
- case spirv::Dim::Buffer:
- case spirv::Dim::Rect:
- break;
- default:
- return emitError("the Dim operand of the image type must "
- "be 1D, 2D, 3D, Buffer, Cube, or Rect");
- }
-
- unsigned componentNumber = 0;
- switch (dim) {
- case spirv::Dim::Dim1D:
- case spirv::Dim::Buffer:
- componentNumber = 1;
- break;
- case spirv::Dim::Dim2D:
- case spirv::Dim::Cube:
- case spirv::Dim::Rect:
- componentNumber = 2;
- break;
- case spirv::Dim::Dim3D:
- componentNumber = 3;
- break;
- default:
- break;
- }
-
- if (imageType.getArrayedInfo() == spirv::ImageArrayedInfo::Arrayed)
- componentNumber += 1;
-
- unsigned resultComponentNumber = 1;
- if (auto resultVectorType = llvm::dyn_cast<VectorType>(resultType))
- resultComponentNumber = resultVectorType.getNumElements();
-
- if (componentNumber != resultComponentNumber)
- return emitError("expected the result to have ")
- << componentNumber << " component(s), but found "
- << resultComponentNumber << " component(s)";
-
- return success();
-}
-
-//===----------------------------------------------------------------------===//
// spirv.VectorTimesScalarOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index fad7db4..e3c8899 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -10,7 +10,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/Dialect/Linalg/IR/RelayoutOpInterface.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index 9bfc2aa..4da1a7b 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -88,13 +88,10 @@ struct ConsolidateTransposeOptimization
return rewriter.notifyMatchFailure(transposeOp,
"input must be transpose operation");
- SmallVector<int32_t> transposePerms, innerTransposePerms;
- if (transposeOp.getConstantPerms(transposePerms).failed())
- return rewriter.notifyMatchFailure(transposeOp,
- "transpose perms must be constant");
- if (innerTranspose.getConstantPerms(innerTransposePerms).failed())
- return rewriter.notifyMatchFailure(
- transposeOp, "inner transpose perms must be constant");
+ const llvm::ArrayRef<int32_t> transposePerms = transposeOp.getPerms();
+ const llvm::ArrayRef<int32_t> innerTransposePerms =
+ innerTranspose.getPerms();
+
if (transposePerms.size() != innerTransposePerms.size())
return rewriter.notifyMatchFailure(
transposeOp,
@@ -108,15 +105,9 @@ struct ConsolidateTransposeOptimization
for (int i = 0, s = transposePerms.size(); i < s; ++i)
perms[i] = innerTransposePerms[transposePerms[i]];
- auto permsTy =
- RankedTensorType::get(transposePerms.size(), rewriter.getI32Type());
- auto permsAttr = DenseIntElementsAttr::get(permsTy, perms);
- Value permsValue = rewriter.create<tosa::ConstOp>(transposeOp.getLoc(),
- permsTy, permsAttr);
-
rewriter.replaceOpWithNewOp<tosa::TransposeOp>(
transposeOp, transposeOp.getResult().getType(),
- innerTranspose.getInput1(), permsValue);
+ innerTranspose.getInput1(), rewriter.getDenseI32ArrayAttr(perms));
return success();
}
@@ -128,10 +119,6 @@ struct TransposeIsReshape : public OpRewritePattern<tosa::TransposeOp> {
LogicalResult matchAndRewrite(tosa::TransposeOp op,
PatternRewriter &rewriter) const override {
- DenseIntElementsAttr permAttr;
- if (!matchPattern(op.getPerms(), m_Constant(&permAttr)))
- return rewriter.notifyMatchFailure(op, "Non-constant permutation");
-
if (op.getInput1().getDefiningOp<tosa::TransposeOp>())
return rewriter.notifyMatchFailure(
op, "Src is from transpose, can compose transposes");
@@ -156,9 +143,7 @@ struct TransposeIsReshape : public OpRewritePattern<tosa::TransposeOp> {
if (numDynDims > 1)
return rewriter.notifyMatchFailure(op, "Has more than one dynamic dim.");
- SmallVector<int64_t> permValues = llvm::to_vector<6>(
- llvm::map_range(permAttr.getValues<APInt>(),
- [](const APInt &val) { return val.getSExtValue(); }));
+ const llvm::ArrayRef<int32_t> permValues = op.getPerms();
SmallVector<int64_t> nonZeroPerms;
nonZeroPerms.reserve(permValues.size());
@@ -680,10 +665,11 @@ OpFoldResult IntDivOp::fold(FoldAdaptor adaptor) {
return getInput1();
}
- if (rhsAttr && lhsAttr && rhsAttr.isSplat() && lhsAttr.isSplat()) {
- if (llvm::isa<IntegerType>(resultETy)) {
- APInt l = lhsAttr.getSplatValue<APInt>();
- APInt r = rhsAttr.getSplatValue<APInt>();
+ if (rhsAttr && lhsAttr && rhsAttr.isSplat() && lhsAttr.isSplat() &&
+ llvm::isa<IntegerType>(resultETy)) {
+ APInt l = lhsAttr.getSplatValue<APInt>();
+ APInt r = rhsAttr.getSplatValue<APInt>();
+ if (!r.isZero()) {
APInt result = l.sdiv(r);
return DenseElementsAttr::get(resultTy, result);
}
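As a worked example: splat i32 operands 6 and 2 still fold to a splat
constant 3, while a splat zero divisor now simply leaves the op unfolded
rather than performing a division by zero during folding.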
@@ -1175,9 +1161,7 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
}
// Transpose is not the identity transpose.
- SmallVector<int32_t> perms;
- if (getConstantPerms(perms).failed())
- return {};
+ const llvm::ArrayRef<int32_t> perms = getPerms();
if (!llvm::equal(llvm::seq<int32_t>(0, perms.size()), perms))
return {};
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index e9c33e1..20346b4 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -291,12 +291,10 @@ static LogicalResult verifyConvOp(T op) {
ElementsAttr weightZpAttr;
if (!matchPattern(op.getInputZp(), m_Constant(&inputZpAttr)) ||
!matchPattern(op.getWeightZp(), m_Constant(&weightZpAttr))) {
- op.emitOpError(
- "bail out if the actual value of zero points cannot be determined");
- return failure();
+ return success();
}
- // Get and verify explicit zero points.
+ // Get and verify explicit constant zero points.
int64_t inputZpVal;
int64_t weightZpVal;
@@ -1374,54 +1372,37 @@ llvm::LogicalResult tosa::ReshapeOp::verify() {
return mlir::success();
}
-LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int32_t> &perms) {
- // Perms must be constants.
- DenseIntElementsAttr permsAttr;
- if (!matchPattern(getPerms(), m_Constant(&permsAttr)))
- return failure();
-
- perms.clear();
- for (auto v : permsAttr.getValues<APInt>())
- perms.push_back(v.getSExtValue());
-
- return success();
-}
-
LogicalResult tosa::TransposeOp::inferReturnTypeComponents(
MLIRContext *context, ::std::optional<Location> location,
TransposeOp::Adaptor adaptor,
SmallVectorImpl<ShapedTypeComponents> &inferredReturnShapes) {
ShapeAdaptor inputShape(adaptor.getInput1().getType());
- ShapeAdaptor permsShape(adaptor.getPerms().getType());
-
- // We cannot infer anything from a rank-0 "permutation" tensor.
- if (permsShape.hasRank() && permsShape.getRank() == 0)
- return failure();
// If input rank and permutation length is unknown, the output rank is
// unknown.
- if (!inputShape.hasRank() || !permsShape.hasRank() ||
- permsShape.isDynamicDim(0)) {
+ if (!inputShape.hasRank()) {
inferredReturnShapes.push_back(ShapedTypeComponents());
return success();
}
+ const auto inputRank = inputShape.getRank();
+
// This would imply the number of permutations does not match the rank of
// the input which is illegal.
- if (permsShape.getDimSize(0) != inputShape.getRank()) {
+ if (adaptor.getPerms().size() != static_cast<size_t>(inputRank)) {
return failure();
}
SmallVector<int64_t> outputShape;
// Rank-0 means no permutations matter.
- if (inputShape.getRank() == 0) {
+ if (inputRank == 0) {
inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
return success();
}
// Check whether the input dimensions are all the same.
bool allTheSame = true;
- for (int i = 1, s = inputShape.getRank(); i < s; i++) {
+ for (int i = 1, s = inputRank; i < s; i++) {
if (inputShape.getDimSize(0) != inputShape.getDimSize(i)) {
allTheSame = false;
break;
@@ -1431,34 +1412,21 @@ LogicalResult tosa::TransposeOp::inferReturnTypeComponents(
// If all of the input dimensions are the same we don't care about the
// permutation.
if (allTheSame) {
- outputShape.resize(inputShape.getRank(), inputShape.getDimSize(0));
+ outputShape.resize(inputRank, inputShape.getDimSize(0));
inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
return success();
}
- outputShape.resize(inputShape.getRank(), ShapedType::kDynamic);
- // If the permuations are a constant we can directly determine the output
- // shape.
- DenseIntElementsAttr attr;
- if (matchPattern(adaptor.getPerms(), m_Constant(&attr)) &&
- attr.getType().getRank() == 1) {
- ShapeAdaptor permShape = attr;
- // Constant permutation must be the same length as the input rank.
- if (inputShape.getRank() != permShape.getRank())
- return emitOptionalError(location,
- "constant permutation must be the same length"
- " as the input rank");
-
- // Constant permutation values must be within the input rank.
- for (int i = 0, e = inputShape.getRank(); i < e; i++) {
- if (inputShape.getRank() <= permShape.getDimSize(i))
- return failure();
- }
+ outputShape.resize(inputRank, ShapedType::kDynamic);
- outputShape.reserve(inputShape.getRank());
- for (int i = 0, s = inputShape.getRank(); i < s; i++) {
- outputShape[i] = inputShape.getDimSize(permShape.getDimSize(i));
- }
+ // Constant permutation values must be within the input rank.
+ if (llvm::any_of(adaptor.getPerms(),
+ [inputRank](const auto i) { return i >= inputRank; }))
+ return failure();
+
+ outputShape.reserve(inputRank);
+ for (int i = 0, s = inputRank; i < s; i++) {
+ outputShape[i] = inputShape.getDimSize(adaptor.getPerms()[i]);
}
inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
@@ -1467,75 +1435,60 @@ LogicalResult tosa::TransposeOp::inferReturnTypeComponents(
LogicalResult tosa::TransposeOp::verify() {
TensorType inputType = getInput1().getType();
- TensorType permType = getPerms().getType();
TensorType outputType = getOutput().getType();
+ const llvm::ArrayRef<int32_t> constantPerms = getPerms();
- if (permType.hasRank() && permType.getRank() != 1)
- return emitOpError()
- << "expected permutation tensor to be rank 1 but got rank "
- << permType.getRank();
- if (inputType.hasRank() && permType.hasRank())
- if (!permType.isDynamicDim(0) &&
- permType.getDimSize(0) != inputType.getRank())
- return emitOpError() << "expected permutation tensor dim 0 to have size "
- << inputType.getRank()
- << " (input rank) but got size "
- << permType.getDimSize(0);
+ if (inputType.hasRank() &&
+ constantPerms.size() != static_cast<size_t>(inputType.getRank()))
+ return emitOpError() << "expected perms attribute to have size "
+ << inputType.getRank() << " (input rank) but got size "
+ << constantPerms.size();
if (inputType.hasRank() && outputType.hasRank() &&
inputType.getRank() != outputType.getRank())
return emitOpError()
<< "expected input tensor rank to equal result tensor rank";
- if (outputType.hasRank() && permType.hasRank())
- if (!permType.isDynamicDim(0) &&
- permType.getDimSize(0) != outputType.getRank())
- return emitOpError() << "expected permutation tensor dim 0 to have size "
- << outputType.getRank()
- << " (output rank) but got size "
- << permType.getDimSize(0);
-
- SmallVector<int32_t> constantPerms;
- if (succeeded(getConstantPerms(constantPerms))) {
- // Assert that the permutation tensor has a rank, which means that the
- // rank has been verified above.
- assert(permType.hasRank() &&
- "Unexpectedly found permutation tensor without rank");
- if (!llvm::all_of(constantPerms,
- [&constantPerms](int32_t s) {
- return s >= 0 &&
- static_cast<size_t>(s) < constantPerms.size();
- }) ||
- !isPermutationVector(llvm::to_vector(llvm::map_range(
- constantPerms, [](int32_t v) -> int64_t { return v; }))))
- return emitOpError() << "expected valid permutation tensor";
-
- // Verify that the types of the input and output tensors are properly
- // permuted.
- if (inputType.hasRank() && outputType.hasRank()) {
- assert(constantPerms.size() == static_cast<size_t>(inputType.getRank()) &&
- inputType.getRank() == outputType.getRank());
-
- for (auto i = 0; i < outputType.getRank(); i++) {
- if (inputType.isDynamicDim(constantPerms[i]) ||
- outputType.isDynamicDim(i))
- continue;
-
- if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i))
- return emitOpError()
- << "expected output tensor dim " << i << " to match "
- << "input dim " << constantPerms[i] << " with value of "
- << inputType.getDimSize(constantPerms[i]);
- }
+ if (outputType.hasRank() &&
+ constantPerms.size() != static_cast<size_t>(outputType.getRank()))
+ return emitOpError() << "expected perms attribute to have size "
+ << outputType.getRank()
+ << " (output rank) but got size "
+ << constantPerms.size();
+
+ if (!llvm::all_of(constantPerms,
+ [&constantPerms](int32_t s) {
+ return s >= 0 &&
+ static_cast<size_t>(s) < constantPerms.size();
+ }) ||
+ !isPermutationVector(llvm::to_vector(llvm::map_range(
+ constantPerms, [](int32_t v) -> int64_t { return v; }))))
+ return emitOpError() << "expected valid permutation indices";
+
+ // Verify that the types of the input and output tensors are properly
+ // permuted.
+ if (inputType.hasRank() && outputType.hasRank()) {
+ assert(constantPerms.size() == static_cast<size_t>(inputType.getRank()) &&
+ inputType.getRank() == outputType.getRank());
+
+ for (auto i = 0; i < outputType.getRank(); i++) {
+ if (inputType.isDynamicDim(constantPerms[i]) ||
+ outputType.isDynamicDim(i))
+ continue;
+
+ if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i))
+ return emitOpError()
+ << "expected output tensor dim " << i << " to match "
+ << "input dim " << constantPerms[i] << " with value of "
+ << inputType.getDimSize(constantPerms[i]);
}
}
+
return success();
}
LogicalResult TransposeOp::reifyResultShapes(
OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedReturnShapes) {
- SmallVector<int32_t> transposePerms;
- if (getConstantPerms(transposePerms).failed())
- return failure();
+ const llvm::ArrayRef<int32_t> transposePerms = getPerms();
Value input = getInput1();
auto inputType = cast<TensorType>(input.getType());
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 26baddc..61011b6 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -166,13 +166,9 @@ public:
getTosaConstShape(rewriter, loc, weightReshapeDims0));
// Transpose the factored-out stride to the output channels.
- Value transposeWeightVal = rewriter.create<tosa::ConstOp>(
- loc, RankedTensorType::get({6}, rewriter.getI32Type()),
- rewriter.getI32TensorAttr({2, 4, 0, 1, 3, 5}));
-
weight = CreateOpAndInferShape<tosa::TransposeOp>(
rewriter, loc, UnrankedTensorType::get(weightETy), weight,
- transposeWeightVal);
+ rewriter.getDenseI32ArrayAttr({2, 4, 0, 1, 3, 5}));
// Collapse the strides and output channels into a single dimension.
llvm::SmallVector<int64_t, 4> weightReshapeDims1 = {
@@ -269,13 +265,9 @@ public:
convReshapeDims0Value);
// Transpose the factored-out stride to the output channels.
- Value transposeConvVal = rewriter.create<tosa::ConstOp>(
- loc, RankedTensorType::get({6}, rewriter.getI32Type()),
- rewriter.getI32TensorAttr({0, 1, 3, 2, 4, 5}));
-
conv2d = CreateOpAndInferShape<tosa::TransposeOp>(
rewriter, loc, UnrankedTensorType::get(convETy), conv2d,
- transposeConvVal);
+ rewriter.getDenseI32ArrayAttr({0, 1, 3, 2, 4, 5}));
// Fuse striding behavior back into width / height.
llvm::SmallVector<int64_t, 6> convReshapeDims1 = {
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
index 403ac48..43e9507 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
@@ -224,13 +224,8 @@ struct TosaFoldConstantTranspose : public OpRewritePattern<tosa::TransposeOp> {
if (!llvm::hasSingleElement(op.getInput1().getDefiningOp()->getUsers()))
return failure();
- DenseIntElementsAttr permAttr;
- if (!matchPattern(op.getPerms(), m_Constant(&permAttr)))
- return failure();
auto permValues = llvm::map_to_vector(
- // TOSA allows both 32- and 64-bit integer tensors here.
- permAttr.getValues<APInt>(),
- [](const APInt &val) { return val.getSExtValue(); });
+ op.getPerms(), [](const int32_t v) { return static_cast<int64_t>(v); });
auto inputType = cast<ShapedType>(op.getInput1().getType());
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp
index 64e5c31..36f511f 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp
@@ -367,9 +367,7 @@ std::optional<Value> TosaReduceTransposes::buildMappedToValue(
std::optional<Value> TosaReduceTransposes::buildMappedToValue(
TransposeOp transposeOp, const DenseMap<Value, Value> &valuesMap,
IRRewriter &rewriter, ArrayRef<int32_t> hoistedPerms) {
- SmallVector<int32_t> perms;
- if (failed(transposeOp.getConstantPerms(perms)) ||
- !areInvolutionTransposes(hoistedPerms, perms))
+ if (!areInvolutionTransposes(hoistedPerms, transposeOp.getPerms()))
return std::nullopt;
return transposeOp.getInput1();
}
@@ -506,14 +504,11 @@ bool TosaReduceTransposes::dependenciesAreValid(
// replaced.
Operation *user = use.getOwner();
if (auto otherTranspose = llvm::dyn_cast<TransposeOp>(user)) {
- SmallVector<int32_t> otherPerms;
-
// Can later think about cases where transpose -> transpose
// or reshape -> transpose, where the transposes are not necessarily
// the same perms as the hoisted, if implementing a more general
// transform. These could be permitted.
- if (failed(otherTranspose.getConstantPerms(otherPerms)) ||
- !llvm::equal(perms, otherPerms))
+ if (!llvm::equal(perms, otherTranspose.getPerms()))
return false;
} else if (userNotContainedInValidTransposeDependencies(
user, validTransposes, transposeInfo)) {
@@ -607,9 +602,8 @@ void TosaReduceTransposes::runOnOperation() {
!llvm::isa<RankedTensorType>(output.getType()))
return;
- // No transformation when transpose permutation non-constant.
- if (failed(transposeOp.getConstantPerms(perms)))
- return;
+ llvm::for_each(transposeOp.getPerms(),
+ [&perms](const auto i) { perms.emplace_back(i); });
// We let --canonicalize deal with identity transpose.
if (llvm::equal(llvm::seq<int32_t>(0, perms.size()), perms))
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index f74a4b4..708b3fc3 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -56,15 +56,6 @@ static LogicalResult checkConstantOperandPad(Operation *op) {
return success();
}
-static LogicalResult checkConstantOperandTranspose(Operation *op) {
- if (auto transposeOp = dyn_cast<tosa::TransposeOp>(op)) {
- DenseElementsAttr perms;
- if (!matchPattern(transposeOp.getPerms(), m_Constant(&perms)))
- return op->emitOpError("perms of transpose is not constant");
- }
- return success();
-}
-
struct TosaLevel {
int32_t MAX_RANK = 0;
int32_t MAX_KERNEL = 0;
@@ -118,7 +109,6 @@ public:
private:
void populateConstantOperandChecks() {
constCheckers.emplace_back(checkConstantOperandPad);
- constCheckers.emplace_back(checkConstantOperandTranspose);
}
bool levelCheckKernel(Operation *op, int32_t v,
@@ -425,7 +415,7 @@ private:
} else {
llvm::errs() << "unknown TOSA extension name passed in: " << ext
<< ", supported extension are int16, int4, bf16, "
- << "fp8e4m3, fp8e5m2, fft, and variable\n";
+ << "fp8e4m3, fp8e5m2, fft, variable and controlflow\n";
return signalPassFailure();
}
}
diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
index d1a8732..3a51939 100644
--- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
+++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
@@ -77,24 +77,21 @@ computeReshapeOutput(ArrayRef<int64_t> higherRankShape,
// Initialize new shapes with [1] * higherRank.
int64_t higherRank = higherRankShape.size();
int64_t lowerRank = lowerRankShape.size();
-
reshapeOutputShape.assign(higherRank, 1);
int64_t higherRankDim;
int64_t lowerRankDim;
+ const int64_t rankDiff = higherRank - lowerRank;
+
+ for (int64_t i = lowerRank - 1; i >= 0; i--) {
+ higherRankDim = higherRankShape[i + rankDiff];
+ lowerRankDim = lowerRankShape[i];
- for (int64_t i = higherRank - 1, j = lowerRank - 1; i >= 0 && j >= 0;
- i--, j--) {
- higherRankDim = higherRankShape[i];
- lowerRankDim = lowerRankShape[j];
-
- if (lowerRankDim == 1 && higherRankDim > 1)
- reshapeOutputShape[i] = 1;
- else if ((lowerRankDim > 1 && higherRankDim == 1) ||
- (lowerRankDim == higherRankDim))
- reshapeOutputShape[i] = lowerRankDim;
- else if (higherRankDim != lowerRankDim)
+ if (lowerRankDim != 1 && higherRankDim != 1 &&
+ lowerRankDim != higherRankDim)
return failure();
+
+ reshapeOutputShape[i + rankDiff] = lowerRankDim == 1 ? 1 : lowerRankDim;
}
return success();
}
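As a worked example: for higherRankShape = [2, 3, 4] and lowerRankShape =
[3, 1], rankDiff is 1 and the right-aligned loop produces reshapeOutputShape
= [1, 3, 1]; a pair like [2, 4, 5] against [3] fails, since 3 matches
neither 1 nor the aligned dimension 5.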
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index e3127c6..78c2425 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,13 +8,8 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/LogicalResult.h"
-#include <cassert>
namespace mlir {
namespace xegpu {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index fc9498a..3bdf3fb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -10,13 +10,9 @@
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/TypeUtilities.h"
-#include "mlir/Support/LLVM.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/LogicalResult.h"
#define DEBUG_TYPE "xegpu"
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6883d78..f438792 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -161,6 +161,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getDevice())
result = todo("device");
};
+ auto checkDistSchedule = [&todo](auto op, LogicalResult &result) {
+ if (op.getDistScheduleChunkSize())
+ result = todo("dist_schedule with chunk_size");
+ };
auto checkHasDeviceAddr = [&todo](auto op, LogicalResult &result) {
if (!op.getHasDeviceAddrVars().empty())
result = todo("has_device_addr");
@@ -169,15 +173,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getHint())
op.emitWarning("hint clause discarded");
};
- auto checkHostEval = [](auto op, LogicalResult &result) {
- // Host evaluated clauses are supported, except for loop bounds.
- for (BlockArgument arg :
- cast<omp::BlockArgOpenMPOpInterface>(*op).getHostEvalBlockArgs())
- for (Operation *user : arg.getUsers())
- if (isa<omp::LoopNestOp>(user))
- result = op.emitError("not yet implemented: host evaluation of loop "
- "bounds in omp.target operation");
- };
auto checkInReduction = [&todo](auto op, LogicalResult &result) {
if (!op.getInReductionVars().empty() || op.getInReductionByref() ||
op.getInReductionSyms())
@@ -252,6 +247,12 @@ static LogicalResult checkImplementationStatus(Operation &op) {
LogicalResult result = success();
llvm::TypeSwitch<Operation &>(op)
+ .Case([&](omp::DistributeOp op) {
+ checkAllocate(op, result);
+ checkDistSchedule(op, result);
+ checkOrder(op, result);
+ checkPrivate(op, result);
+ })
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
checkAllocate(op, result);
@@ -308,7 +309,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkBare(op, result);
checkDevice(op, result);
checkHasDeviceAddr(op, result);
- checkHostEval(op, result);
checkInReduction(op, result);
checkIsDevicePtr(op, result);
checkPrivate(op, result);
@@ -1976,6 +1976,14 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
bool isSimd = wsloopOp.getScheduleSimd();
bool loopNeedsBarrier = !wsloopOp.getNowait();
+ // The only legal way for the direct parent to be omp.distribute is that this
+ // represents 'distribute parallel do'. Otherwise, this is a regular
+ // worksharing loop.
+ llvm::omp::WorksharingLoopType workshareLoopType =
+ llvm::isa_and_present<omp::DistributeOp>(opInst.getParentOp())
+ ? llvm::omp::WorksharingLoopType::DistributeForStaticLoop
+ : llvm::omp::WorksharingLoopType::ForStaticLoop;
+
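Schematically, the composite case this detects looks like (op layout
abbreviated):

    omp.distribute {
      omp.wsloop {          // direct parent is omp.distribute
        omp.loop_nest ...
      }
    }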
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::Expected<llvm::BasicBlock *> regionBlock = convertOmpOpRegions(
wsloopOp.getRegion(), "omp.wsloop.region", builder, moduleTranslation);
@@ -1991,7 +1999,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
convertToScheduleKind(schedule), chunk, isSimd,
scheduleMod == omp::ScheduleModifier::monotonic,
- scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
+ scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
+ workshareLoopType);
if (failed(handleError(wsloopIP, opInst)))
return failure();
@@ -3854,6 +3863,78 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
return success();
}
+static LogicalResult
+convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ auto distributeOp = cast<omp::DistributeOp>(opInst);
+ if (failed(checkImplementationStatus(opInst)))
+ return failure();
+
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ auto bodyGenCB = [&](InsertPointTy allocaIP,
+ InsertPointTy codeGenIP) -> llvm::Error {
+ // Save the alloca insertion point on ModuleTranslation stack for use in
+ // nested regions.
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // DistributeOp has only one region associated with it.
+ builder.restoreIP(codeGenIP);
+
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+ llvm::Expected<llvm::BasicBlock *> regionBlock =
+ convertOmpOpRegions(distributeOp.getRegion(), "omp.distribute.region",
+ builder, moduleTranslation);
+ if (!regionBlock)
+ return regionBlock.takeError();
+ builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin());
+
+ // Skip applying a workshare loop below when translating 'distribute
+ // parallel do' (it has already been handled by this point, while
+ // translating the nested omp.wsloop).
+ if (isa_and_present<omp::WsloopOp>(distributeOp.getNestedWrapper()))
+ return llvm::Error::success();
+
+ // TODO: Add support for clauses which are valid for DISTRIBUTE constructs.
+ // Static schedule is the default.
+ auto schedule = omp::ClauseScheduleKind::Static;
+ bool isOrdered = false;
+ std::optional<omp::ScheduleModifier> scheduleMod;
+ bool isSimd = false;
+ llvm::omp::WorksharingLoopType workshareLoopType =
+ llvm::omp::WorksharingLoopType::DistributeStaticLoop;
+ bool loopNeedsBarrier = false;
+ llvm::Value *chunk = nullptr;
+
+ llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP =
+ ompBuilder->applyWorkshareLoop(
+ ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier,
+ convertToScheduleKind(schedule), chunk, isSimd,
+ scheduleMod == omp::ScheduleModifier::monotonic,
+ scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered,
+ workshareLoopType);
+
+ if (!wsloopIP)
+ return wsloopIP.takeError();
+ return llvm::Error::success();
+ };
+
+ llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+ findAllocaInsertPoint(builder, moduleTranslation);
+ llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
+ ompBuilder->createDistribute(ompLoc, allocaIP, bodyGenCB);
+
+ if (failed(handleError(afterIP, opInst)))
+ return failure();
+
+ builder.restoreIP(*afterIP);
+ return success();
+}
+
/// Lowers the FlagsAttr which is applied to the module on the device
/// pass when offloading, this attribute contains OpenMP RTL globals that can
/// be passed as flags to the frontend, otherwise they are set to default
@@ -4067,9 +4148,13 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
///
/// Loop bounds and steps are only optionally populated, if output vectors are
/// provided.
-static void extractHostEvalClauses(omp::TargetOp targetOp, Value &numThreads,
- Value &numTeamsLower, Value &numTeamsUpper,
- Value &threadLimit) {
+static void
+extractHostEvalClauses(omp::TargetOp targetOp, Value &numThreads,
+ Value &numTeamsLower, Value &numTeamsUpper,
+ Value &threadLimit,
+ llvm::SmallVectorImpl<Value> *lowerBounds = nullptr,
+ llvm::SmallVectorImpl<Value> *upperBounds = nullptr,
+ llvm::SmallVectorImpl<Value> *steps = nullptr) {
auto blockArgIface = llvm::cast<omp::BlockArgOpenMPOpInterface>(*targetOp);
for (auto item : llvm::zip_equal(targetOp.getHostEvalVars(),
blockArgIface.getHostEvalBlockArgs())) {
@@ -4094,11 +4179,26 @@ static void extractHostEvalClauses(omp::TargetOp targetOp, Value &numThreads,
llvm_unreachable("unsupported host_eval use");
})
.Case([&](omp::LoopNestOp loopOp) {
- // TODO: Extract bounds and step values. Currently, this cannot be
- // reached because translation would have been stopped earlier as a
- // result of `checkImplementationStatus` detecting and reporting
- // this situation.
- llvm_unreachable("unsupported host_eval use");
+ auto processBounds =
+ [&](OperandRange opBounds,
+ llvm::SmallVectorImpl<Value> *outBounds) -> bool {
+ bool found = false;
+ for (auto [i, lb] : llvm::enumerate(opBounds)) {
+ if (lb == blockArg) {
+ found = true;
+ if (outBounds)
+ (*outBounds)[i] = hostEvalVar;
+ }
+ }
+ return found;
+ };
+ bool found =
+ processBounds(loopOp.getLoopLowerBounds(), lowerBounds);
+ found = processBounds(loopOp.getLoopUpperBounds(), upperBounds) ||
+ found;
+ found = processBounds(loopOp.getLoopSteps(), steps) || found;
+ (void)found;
+ assert(found && "unsupported host_eval use");
})
.Default([](Operation *) {
llvm_unreachable("unsupported host_eval use");
@@ -4235,6 +4335,7 @@ initTargetDefaultAttrs(omp::TargetOp targetOp,
combinedMaxThreadsVal = maxThreadsVal;
// Update kernel bounds structure for the `OpenMPIRBuilder` to use.
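+  // The execution mode (generic vs. SPMD) captured by the target op is
+  // propagated here as well; it later determines whether a loop trip count
+  // is computed for the kernel.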
+ attrs.ExecFlags = targetOp.getKernelExecFlags();
attrs.MinTeams = minTeamsVal;
attrs.MaxTeams.front() = maxTeamsVal;
attrs.MinThreads = 1;
@@ -4252,9 +4353,15 @@ initTargetRuntimeAttrs(llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
omp::TargetOp targetOp,
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs &attrs) {
+ omp::LoopNestOp loopOp = castOrGetParentOfType<omp::LoopNestOp>(
+ targetOp.getInnermostCapturedOmpOp());
+ unsigned numLoops = loopOp ? loopOp.getNumLoops() : 0;
+
Value numThreads, numTeamsLower, numTeamsUpper, teamsThreadLimit;
+ llvm::SmallVector<Value> lowerBounds(numLoops), upperBounds(numLoops),
+ steps(numLoops);
extractHostEvalClauses(targetOp, numThreads, numTeamsLower, numTeamsUpper,
- teamsThreadLimit);
+ teamsThreadLimit, &lowerBounds, &upperBounds, &steps);
// TODO: Handle constant 'if' clauses.
if (Value targetThreadLimit = targetOp.getThreadLimit())
@@ -4274,7 +4381,34 @@ initTargetRuntimeAttrs(llvm::IRBuilderBase &builder,
if (numThreads)
attrs.MaxThreads = moduleTranslation.lookupValue(numThreads);
- // TODO: Populate attrs.LoopTripCount if it is target SPMD.
+ if (targetOp.getKernelExecFlags() != llvm::omp::OMP_TGT_EXEC_MODE_GENERIC) {
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ attrs.LoopTripCount = nullptr;
+
+ // To calculate the trip count, we multiply together the trip counts of
+ // every collapsed canonical loop. We don't need to create the loop nests
+ // here, since we're only interested in the trip count.
+ for (auto [loopLower, loopUpper, loopStep] :
+ llvm::zip_equal(lowerBounds, upperBounds, steps)) {
+ llvm::Value *lowerBound = moduleTranslation.lookupValue(loopLower);
+ llvm::Value *upperBound = moduleTranslation.lookupValue(loopUpper);
+ llvm::Value *step = moduleTranslation.lookupValue(loopStep);
+
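+      // Conceptually, for an exclusive upper bound this computes
+      //   tripCount = ceildiv(upperBound - lowerBound, step)
+      // with an adjustment when the loop is inclusive; the per-loop counts
+      // are multiplied below to cover the whole collapsed nest.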
+ llvm::OpenMPIRBuilder::LocationDescription loc(builder);
+ llvm::Value *tripCount = ompBuilder->calculateCanonicalLoopTripCount(
+ loc, lowerBound, upperBound, step, /*IsSigned=*/true,
+ loopOp.getLoopInclusive());
+
+ if (!attrs.LoopTripCount) {
+ attrs.LoopTripCount = tripCount;
+ continue;
+ }
+
+      // TODO: Enable UndefinedBehaviorSanitizer to diagnose an overflow here.
+ attrs.LoopTripCount = builder.CreateMul(attrs.LoopTripCount, tripCount,
+ {}, /*HasNUW=*/true);
+ }
+ }
}
static LogicalResult
@@ -4813,6 +4947,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::TargetOp) {
return convertOmpTarget(*op, builder, moduleTranslation);
})
+ .Case([&](omp::DistributeOp) {
+ return convertOmpDistribute(*op, builder, moduleTranslation);
+ })
.Case([&](omp::LoopNestOp) {
return convertOmpLoopNest(*op, builder, moduleTranslation);
})
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index a524359..1f096ce1 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -463,7 +463,6 @@ func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tens
// CHECK-LABEL: @conv2d_i8
func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) {
@@ -485,7 +484,6 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
// CHECK-LABEL: @conv2d_f32
func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
@@ -786,7 +784,6 @@ func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x
// CHECK-LABEL: @conv3d_f32
func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<28xf32>) -> () {
- // CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
// CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xf32>) permutation = [1, 2, 3, 4, 0]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic
@@ -824,7 +821,6 @@ func.func @conv3d_scalar_bias_f32(%input: tensor<1x49x48x47x27xf32>, %weights: t
// CHECK-LABEL: @conv3d_i8
func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5x27xi8>, %bias: tensor<28xi32>) -> () {
- // CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
// CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xi8>) permutation = [1, 2, 3, 4, 0]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic
@@ -852,10 +848,9 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5
// CHECK-LABEL: @test_transpose
// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x2x3xi32>)
func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () {
- %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32>
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<2x3x1xi32>
// CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x2x3xi32>) outs(%[[INIT]] : tensor<2x3x1xi32>) permutation = [1, 2, 0]
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 2, 0>}: (tensor<1x2x3xi32>) -> tensor<2x3x1xi32>
return
}
@@ -864,12 +859,11 @@ func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () {
// CHECK-LABEL: @test_transpose_dyn
// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>)
func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
- %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32>
// CHECK: %[[C1:.+]] = arith.constant 1
// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor<?x4x1x3xi32>
// CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs(%[[INIT]] : tensor<?x4x1x3xi32>) permutation = [1, 3, 0, 2]
- %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor<?x4x1x3xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 3, 0, 2>}: (tensor<1x?x3x4xi32>) -> tensor<?x4x1x3xi32>
return
}
@@ -878,14 +872,13 @@ func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
// CHECK-LABEL: @test_transpose_dyn_multiple_2d
// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
- %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1
// CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
// CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?xf32>) outs(%[[INIT]] : tensor<?x?xf32>) permutation = [1, 0]
- %1 = tosa.transpose %arg0, %0 : (tensor<?x?xf32>, tensor<2xi32>) -> tensor<?x?xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0>}: (tensor<?x?xf32>) -> tensor<?x?xf32>
return
}
@@ -894,7 +887,6 @@ func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
// CHECK-LABEL: @test_transpose_dyn_multiple_3d
// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
- %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
@@ -903,6 +895,30 @@ func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
// CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
// CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) permutation = [2, 0, 1]
- %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
+ %1 = "tosa.transpose"(%arg0) {perms = array<i32: 2, 0, 1>} : (tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
return
}
+
+// -----
+
+// CHECK-LABEL: @max_pool2d_nan_propagate
+func.func @max_pool2d_nan_propagate(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) {
+ // CHECK: linalg.pooling_nhwc_max
+ // CHECK-NOT: linalg.generic
+ %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>, nan_mode = "PROPAGATE"} : (tensor<1x6x34x62xf32>) -> tensor<1x4x32x62xf32>
+ return %0 : tensor<1x4x32x62xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @max_pool2d_nan_ignore
+func.func @max_pool2d_nan_ignore(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x4x32x62xf32>) {
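+  // With nan_mode = "IGNORE" the plain pooling op (which propagates NaNs)
+  // cannot be used directly; the lowering emits a linalg.generic whose body
+  // detects NaN via `arith.cmpf uno` and selects the non-NaN operand.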
+ // CHECK-NOT: linalg.pooling_nhwc_max
+ // CHECK: linalg.generic
+ // CHECK: arith.maximumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>, nan_mode = "IGNORE"} : (tensor<1x6x34x62xf32>) -> tensor<1x4x32x62xf32>
+ return %0: tensor<1x4x32x62xf32>
+}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 17add2d..86e6f9e 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -1992,3 +1992,195 @@ func.func @test_cast_fp32_i64(%arg0: tensor<1xf32>) -> (tensor<1xi64>) {
%0 = tosa.cast %arg0 : (tensor<1xf32>) -> tensor<1xi64>
return %0: tensor<1xi64>
}
+
+// -----
+
+// CHECK-LABEL: @reduce_min_nan_propagate
+func.func @reduce_min_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.reduce
+ // CHECK: arith.minimumf
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ // CHECK-NOT: arith.constant 0x7FC00000
+ // CHECK-NOT: tensor.empty()
+ // CHECK-NOT: linalg.fill
+ // CHECK-NOT: tensor.empty()
+ // CHECK-NOT: select
+ // CHECK: return
+ %3 = tosa.reduce_min %arg0 {axis = 0 : i32, nan_mode = "PROPAGATE"} : (tensor<5x4xf32>) -> tensor<1x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @reduce_max_nan_propagate
+func.func @reduce_max_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.reduce
+ // CHECK: arith.maximumf
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ // CHECK-NOT: arith.constant 0x7FC00000
+ // CHECK-NOT: tensor.empty()
+ // CHECK-NOT: linalg.fill
+ // CHECK-NOT: tensor.empty()
+ // CHECK-NOT: select
+ // CHECK: return
+ %4 = tosa.reduce_max %arg0 {axis = 0 : i32, nan_mode = "PROPAGATE"} : (tensor<5x4xf32>) -> tensor<1x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @reduce_min_nan_ignore
+func.func @reduce_min_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
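+  // In "IGNORE" mode NaNs are skipped inside the reduce body (cmpf uno +
+  // select); the trailing NaN constant (0x7FC00000), fill, and select
+  // restore NaN for slices whose elements were all NaN.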
+ // CHECK: linalg.reduce
+ // CHECK: arith.minimumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ // CHECK: arith.constant 0x7FC00000
+ // CHECK: tensor.empty()
+ // CHECK: linalg.fill
+ // CHECK: tensor.empty()
+ // CHECK: select
+ %5 = tosa.reduce_min %arg0 {axis = 0 : i32, nan_mode = "IGNORE"} : (tensor<5x4xf32>) -> tensor<1x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @reduce_max_nan_ignore
+func.func @reduce_max_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.reduce
+ // CHECK: arith.maximumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ // CHECK: arith.constant 0x7FC00000
+ // CHECK: tensor.empty()
+ // CHECK: linalg.fill
+ // CHECK: tensor.empty()
+ // CHECK: select
+ %6 = tosa.reduce_max %arg0 {axis = 0 : i32, nan_mode = "IGNORE"} : (tensor<5x4xf32>) -> tensor<1x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @minimum_nan_propagate
+func.func @minimum_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.minimumf
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ %7 = tosa.minimum %arg0, %arg1 {nan_mode = "PROPAGATE"} : (tensor<5x4xf32>, tensor<5x4xf32>) -> tensor<5x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @maximum_nan_propagate
+func.func @maximum_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.maximumf
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ %8 = tosa.maximum %arg0, %arg1 {nan_mode = "PROPAGATE"} : (tensor<5x4xf32>, tensor<5x4xf32>) -> tensor<5x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @minimum_nan_ignore
+func.func @minimum_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.minimumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ %9 = tosa.minimum %arg0, %arg1 {nan_mode = "IGNORE"} : (tensor<5x4xf32>, tensor<5x4xf32>) -> tensor<5x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @maximum_nan_ignore
+func.func @maximum_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.maximumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ %10 = tosa.maximum %arg0, %arg1 {nan_mode = "IGNORE"} : (tensor<5x4xf32>, tensor<5x4xf32>) -> tensor<5x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @argmax_nan_propagate
+func.func @argmax_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.cmpf ogt
+ // CHECK: arith.select
+ // CHECK: arith.select
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ %11 = tosa.argmax %arg0 {axis = 0 : i32, nan_mode = "PROPAGATE"} : (tensor<5x4xf32>) -> tensor<4xi32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @argmax_nan_ignore
+func.func @argmax_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.cmpf ogt
+ // CHECK: arith.select
+ // CHECK: arith.select
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ %12 = tosa.argmax %arg0 {axis = 0 : i32, nan_mode = "IGNORE"} : (tensor<5x4xf32>) -> tensor<4xi32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @clamp_nan_propagate
+func.func @clamp_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.minimumf
+ // CHECK: arith.maximumf
+ // CHECK-NOT: arith.cmpf uno
+ // CHECK-NOT: arith.select
+ // CHECK: linalg.yield
+ %13 = tosa.clamp %arg0 {min_val = 1.0 : f32, max_val = 5.0 : f32, nan_mode = "PROPAGATE"} : (tensor<5x4xf32>) -> tensor<5x4xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @clamp_nan_ignore
+func.func @clamp_nan_ignore(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: arith.minimumf
+ // CHECK: arith.maximumf
+ // CHECK: arith.cmpf uno
+ // CHECK: arith.select
+ // CHECK: linalg.yield
+ %14 = tosa.clamp %arg0 {min_val = 1.0 : f32, max_val = 5.0 : f32, nan_mode = "IGNORE"} : (tensor<5x4xf32>) -> tensor<5x4xf32>
+
+ return
+}
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index cf96a30..b5b951a 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -1,4 +1,5 @@
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{compute-tolerance=0.0}))' -split-input-file | FileCheck %s --check-prefix=ZERO-TOLERANCE
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer maximal}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER-MAXIMAL
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
// All fusion: producer-consumer and sibling.
@@ -544,3 +545,80 @@ func.func @sibling_reduction(%input : memref<10xf32>, %output : memref<10xf32>,
// SIBLING-MAXIMAL-NEXT: affine.store
return
}
+
+// -----
+
+// From https://github.com/llvm/llvm-project/issues/54541
+
+#map = affine_map<(d0) -> (d0 mod 65536)>
+// ZERO-TOLERANCE-LABEL: func @zero_tolerance
+func.func @zero_tolerance(%arg0: memref<65536xcomplex<f64>>, %arg1: memref<30x131072xi64>,
+%3 : memref<30xi64>,
+%4 : memref<30xi64>,
+%5 : memref<30xi64>,
+%6 : memref<30xi64>
+) {
+ %c65536 = arith.constant 65536 : index
+ %cst = arith.constant 0.000000e+00 : f64
+ %cst_0 = arith.constant 0x4320000000380004 : f64
+ %cst_1 = arith.constant 5.000000e-01 : f64
+ %0 = memref.alloc() {alignment = 128 : i64} : memref<30x131072xi64>
+ %1 = memref.alloc() {alignment = 128 : i64} : memref<131072xi1>
+ %2 = memref.alloc() {alignment = 128 : i64} : memref<131072xi128>
+  // This nest shouldn't be fused in when a zero tolerance is specified.
+ // ZERO-TOLERANCE: affine.for %{{.*}} = 0 to 131072
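+  // Fusing it into the 30x131072 consumer below would recompute its work 30
+  // times; a compute tolerance of 0.0 forbids any redundant computation.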
+ affine.for %arg2 = 0 to 131072 {
+ %7 = affine.apply #map(%arg2)
+ %8 = affine.load %arg0[%7] : memref<65536xcomplex<f64>>
+ %9 = arith.cmpi ult, %arg2, %c65536 : index
+ %10 = complex.im %8 : complex<f64>
+ %11 = complex.re %8 : complex<f64>
+ %12 = arith.select %9, %11, %10 : f64
+ %13 = arith.cmpf olt, %12, %cst : f64
+ %14 = arith.negf %12 : f64
+ %15 = arith.select %13, %14, %12 : f64
+ %16 = arith.mulf %15, %cst_0 : f64
+ %17 = arith.addf %16, %cst_1 : f64
+ %18 = arith.fptosi %17 : f64 to i128
+ affine.store %18, %2[%arg2] : memref<131072xi128>
+ affine.store %13, %1[%arg2] : memref<131072xi1>
+ }
+ // The next two nests are fused.
+ // ZERO-TOLERANCE: affine.for %{{.*}} = 0 to 30
+ // ZERO-TOLERANCE-NEXT: affine.for %{{.*}} = 0 to 131072
+ // ZERO-TOLERANCE: func.call @__external_reduce_barrett
+ // ZERO-TOLERANCE: affine.store
+ // ZERO-TOLERANCE: affine.load
+ // ZERO-TOLERANCE-NEXT: affine.store
+ affine.for %arg2 = 0 to 30 {
+ affine.for %arg3 = 0 to 131072 {
+ %7 = affine.load %6[%arg2] : memref<30xi64>
+ %8 = affine.load %3[%arg2] : memref<30xi64>
+ %9 = affine.load %5[%arg2] : memref<30xi64>
+ %10 = affine.load %4[%arg2] : memref<30xi64>
+ %11 = affine.load %2[%arg3] : memref<131072xi128>
+ %12 = affine.load %1[%arg3] : memref<131072xi1>
+ %13 = func.call @__external_reduce_barrett(%7, %8, %9, %10, %11) {outputModFac = 1 : i64} : (i64, i64, i64, i64, i128) -> i64
+ %14 = arith.subi %7, %13 : i64
+ %15 = arith.select %12, %14, %13 : i64
+ affine.store %15, %0[%arg2, %arg3] : memref<30x131072xi64>
+ }
+ }
+ func.call @__external_levelwise_forward_ntt(%0) : (memref<30x131072xi64>) -> ()
+ affine.for %arg2 = 0 to 30 {
+ affine.for %arg3 = 0 to 131072 {
+ %7 = affine.load %0[%arg2, %arg3] : memref<30x131072xi64>
+ affine.store %7, %arg1[%arg2, %arg3] : memref<30x131072xi64>
+ }
+ }
+  // Under maximal fusion, only a single fused nest remains.
+ // PRODUCER-CONSUMER-MAXIMAL: affine.for %{{.*}} = 0 to 30
+ // PRODUCER-CONSUMER-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 131072
+ // PRODUCER-CONSUMER-MAXIMAL-NOT: affine.for %{{.*}}
+ memref.dealloc %2 : memref<131072xi128>
+ memref.dealloc %1 : memref<131072xi1>
+ memref.dealloc %0 : memref<30x131072xi64>
+ return
+}
+func.func private @__external_levelwise_forward_ntt(memref<30x131072xi64>)
+func.func private @__external_reduce_barrett(i64, i64, i64, i64, i128) -> i64
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
index ec2fb58..e7797d4 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
@@ -796,3 +796,17 @@ func.func @result_type_mismatch(%c: i1) -> tensor<5xf32> {
return %1 : tensor<5xf32>
}
+
+// -----
+
+// CHECK-LABEL: @outer_func({{.+}}: memref<
+func.func @outer_func(%t: tensor<5xf32>) -> tensor<5xf32> {
+ return %t : tensor<5xf32>
+}
+
+module @inner_module {
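+  // The nested module forms its own symbol table; its function is not
+  // bufferized and keeps its tensor signature.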
+ // CHECK: @inner_func({{.+}}: tensor<5xf32> {bufferization.writable = false})
+ func.func @inner_func(%t: tensor<5xf32> {bufferization.writable = false}) -> tensor<5xf32> {
+ return %t : tensor<5xf32>
+ }
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir
index 3c50a9e..a2741ab 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir
@@ -111,23 +111,21 @@ module attributes {transform.with_named_sequence} {
}
}
-module {
- // CHECK-LABEL: func @test_function(
- // CHECK-SAME: %[[A:.*]]: tensor<?xf32>
- func.func @test_function(%A : tensor<?xf32>, %v : vector<4xf32>) -> (tensor<?xf32>) {
- %c0 = arith.constant 0 : index
-
- // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]]
- // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]]
- // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]])
- // CHECK: memref.copy %[[A_memref]], %[[alloc]]
- // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
- // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]]
- %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor<?xf32>
-
- // CHECK: return %[[res_tensor]]
- return %0 : tensor<?xf32>
- }
+// CHECK-LABEL: func @test_function(
+// CHECK-SAME: %[[A:.*]]: tensor<?xf32>
+func.func @test_function(%A : tensor<?xf32>, %v : vector<4xf32>) -> (tensor<?xf32>) {
+ %c0 = arith.constant 0 : index
+
+ // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]]
+ // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]]
+ // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]])
+ // CHECK: memref.copy %[[A_memref]], %[[alloc]]
+ // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
+ // CHECK: %[[res_tensor:.*]] = bufferization.to_tensor %[[alloc]]
+ %0 = vector.transfer_write %v, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+ // CHECK: return %[[res_tensor]]
+ return %0 : tensor<?xf32>
}
// -----
@@ -222,8 +220,8 @@ module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%alloc_tensor = transform.structured.match ops{["bufferization.alloc_tensor"]} in %arg1
: (!transform.any_op) -> !transform.op<"bufferization.alloc_tensor">
- %2, %new = transform.structured.bufferize_to_allocation %alloc_tensor
- {alloc_op = "memref.alloca"}
+ %2, %new = transform.structured.bufferize_to_allocation %alloc_tensor
+ {alloc_op = "memref.alloca"}
: !transform.op<"bufferization.alloc_tensor">
transform.yield
}
diff --git a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
index ef8b80f..e354eb9 100644
--- a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
+++ b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
@@ -34,8 +34,7 @@ func.func @dim_out_of_bounds_2(%idx1 : index, %idx2 : index) -> index {
// CHECK-NEXT: tensor.dim %[[arg]], %[[c2]]
// CHECK-NEXT: return
func.func @dynamic_dim_of_transpose_op(%arg0: tensor<1x2x?x8xi8>) -> index {
- %0 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x?x8xi8>, tensor<4xi32>) -> tensor<1x8x2x?xi8>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 0, 3, 1, 2> }: (tensor<1x2x?x8xi8>) -> tensor<1x8x2x?xi8>
%c3 = arith.constant 3 : index
%dim = tensor.dim %1, %c3 : tensor<1x8x2x?xi8>
return %dim : index
@@ -47,8 +46,7 @@ func.func @dynamic_dim_of_transpose_op(%arg0: tensor<1x2x?x8xi8>) -> index {
// CHECK: arith.constant 100 : index
// CHECK: return
func.func @static_dim_of_transpose_op(%arg0: tensor<1x100x?x8xi8>) -> index {
- %0 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<1x100x?x8xi8>, tensor<4xi32>) -> tensor<1x8x100x?xi8>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 0, 3, 1, 2> }: (tensor<1x100x?x8xi8>) -> tensor<1x8x100x?xi8>
%c2 = arith.constant 2 : index
%dim = tensor.dim %1, %c2 : tensor<1x8x100x?xi8>
return %dim : index
diff --git a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
index 1161f85..266b69f 100644
--- a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
@@ -1,20 +1,20 @@
-// RUN: mlir-opt -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt --split-input-file --verify-diagnostics %s | FileCheck %s
//===----------------------------------------------------------------------===//
// spirv.ImageDrefGather
//===----------------------------------------------------------------------===//
func.func @image_dref_gather(%arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // CHECK: spirv.ImageDrefGather {{.*}} : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, {{.*}} : vector<4xf32>, {{.*}} : f32 -> vector<4xi32>
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<4xi32>
+ // CHECK: spirv.ImageDrefGather {{.*}}, {{.*}}, {{.*}} : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
spirv.Return
}
// -----
func.func @image_dref_gather_with_single_imageoperands(%arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // CHECK: spirv.ImageDrefGather {{.*}} ["NonPrivateTexel"] -> vector<4xi32>
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 ["NonPrivateTexel"] -> vector<4xi32>
+ // CHECK: spirv.ImageDrefGather {{.*}}, {{.*}}, {{.*}} ["NonPrivateTexel"] : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 ["NonPrivateTexel"] : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
spirv.Return
}
@@ -22,39 +22,39 @@ func.func @image_dref_gather_with_single_imageoperands(%arg0 : !spirv.sampled_im
func.func @image_dref_gather_with_mismatch_imageoperands(%arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
// expected-error @+1 {{the Image Operands should encode what operands follow, as per Image Operands}}
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 (%arg2, %arg2 : f32, f32) -> vector<4xi32>
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 (%arg2, %arg2) : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 (f32, f32) -> vector<4xi32>
spirv.Return
}
// -----
func.func @image_dref_gather_error_result_type(%arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // expected-error @+1 {{result type must be a vector of four components}}
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<3xi32>
+ // expected-error @+1 {{must be vector of 8/16/32/64-bit integer values of length 4 or vector of 16/32/64-bit float values of length 4}}
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<i32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<3xi32>
spirv.Return
}
// -----
func.func @image_dref_gather_error_same_type(%arg0 : !spirv.sampled_image<!spirv.image<i32, Rect, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // expected-error @+1 {{the component type of result must be the same as sampled type of the underlying image type}}
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Rect, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<4xf32>
+ // expected-error @+1 {{the result component type must match the image sampled type}}
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<i32, Rect, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xf32>
spirv.Return
}
// -----
func.func @image_dref_gather_error_dim(%arg0 : !spirv.sampled_image<!spirv.image<i32, Dim1D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // expected-error @+1 {{the Dim operand of the underlying image type must be 2D, Cube, or Rect}}
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Dim1D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<4xi32>
+ // expected-error @+1 {{the Dim operand of the underlying image must be Dim2D or Cube or Rect}}
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<i32, Dim1D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
spirv.Return
}
// -----
func.func @image_dref_gather_error_ms(%arg0 : !spirv.sampled_image<!spirv.image<i32, Cube, NoDepth, NonArrayed, MultiSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) -> () {
- // expected-error @+1 {{the MS operand of the underlying image type must be 0}}
- %0 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<i32, Cube, NoDepth, NonArrayed, MultiSampled, NoSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<4xi32>
+ // expected-error @+1 {{the MS operand of the underlying image type must be SingleSampled}}
+ %0 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<i32, Cube, NoDepth, NonArrayed, MultiSampled, NoSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xi32>
spirv.Return
}
@@ -121,24 +121,24 @@ func.func @image_query_size_error_result2(%arg0 : !spirv.image<f32, Buffer, NoDe
//===----------------------------------------------------------------------===//
func.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) -> () {
- // CHECK: spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
+ // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, vector<4xf32>
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, vector<4xf32>
spirv.Return
}
// -----
func.func @image_write_scalar_texel(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : f32) -> () {
- // CHECK: spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : f32
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : f32
+ // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, f32
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, f32
spirv.Return
}
// -----
func.func @image_write_need_sampler(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) -> () {
- // expected-error @+1 {{the sampled operand of the underlying image must be 0 or 2}}
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
+ // expected-error @+1 {{the sampled operand of the underlying image must be SamplerUnknown or NoSampler}}
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba16>, vector<2xsi32>, vector<4xf32>
spirv.Return
}
@@ -146,7 +146,7 @@ func.func @image_write_need_sampler(%arg0 : !spirv.image<f32, Dim2D, NoDepth, No
func.func @image_write_subpass_data(%arg0 : !spirv.image<f32, SubpassData, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) -> () {
// expected-error @+1 {{the Dim operand of the underlying image must not be SubpassData}}
- spirv.ImageWrite %arg0 : !spirv.image<f32, SubpassData, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, SubpassData, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, vector<4xf32>
spirv.Return
}
@@ -154,6 +154,6 @@ func.func @image_write_subpass_data(%arg0 : !spirv.image<f32, SubpassData, NoDep
func.func @image_write_texel_type_mismatch(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xi32>) -> () {
// expected-error @+1 {{the texel component type must match the image sampled type}}
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, %arg1 : vector<2xsi32>, %arg2 : vector<4xi32>
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba16>, vector<2xsi32>, vector<4xi32>
spirv.Return
}
diff --git a/mlir/test/Dialect/Tosa/availability.mlir b/mlir/test/Dialect/Tosa/availability.mlir
index e66ff4c..d9d1140 100644
--- a/mlir/test/Dialect/Tosa/availability.mlir
+++ b/mlir/test/Dialect/Tosa/availability.mlir
@@ -553,10 +553,9 @@ func.func @test_tile(%arg0: tensor<13x21x3xf32>) -> tensor<39x21x6xf32> {
// -----
// CHECK-LABEL: transpose
func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
- %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
// CHECK: profiles: [ [pro_int, pro_fp] ]
// CHECK: extensions: [ [fp8e4m3, fp8e5m2, bf16] ]
- %1 = tosa.transpose %arg0, %0 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>}: (tensor<13x21x3xf32>) -> tensor<3x13x21xf32>
return %1 : tensor<3x13x21xf32>
}
@@ -629,8 +628,8 @@ func.func @test_identity(%arg0: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> {
// -----
// CHECK-LABEL: cond_if
func.func @test_cond_if(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1>) -> tensor<f32> {
- // CHECK: profiles: [ [pro_int, pro_fp] ]
- // CHECK: extensions: [ [bf16] ]
+ // CHECK: tosa.cond_if profiles: [ ]
+ // CHECK: tosa.cond_if extensions: [ [controlflow] ]
%0 = tosa.cond_if %arg2 -> (tensor<f32>) {
%1 = tosa.add %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<f32>
tosa.yield %1 : tensor<f32>
@@ -645,8 +644,8 @@ func.func @test_cond_if(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1
// CHECK-LABEL: while_loop
func.func @test_while_loop(%arg0: tensor<10xi32>, %arg1: tensor<i32>) {
%0 = "tosa.const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
- // CHECK: profiles: [ [pro_int, pro_fp] ]
- // CHECK: extensions: [ [bf16] ]
+ // CHECK: profiles: [ ]
+ // CHECK: extensions: [ [controlflow] ]
%1:3 = tosa.while_loop (%arg2 = %0, %arg3 = %0, %arg4 = %arg0) : (tensor<i32>, tensor<i32>, tensor<10xi32>) -> (tensor<i32>, tensor<i32>, tensor<10xi32>) {
%2 = tosa.greater_equal %arg3, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
%3 = tosa.logical_not %2 : (tensor<i1>) -> tensor<i1>
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 0e177a0..ef1185e 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -631,12 +631,11 @@ func.func @reshape_canonicalize_quant_nofold() -> (tensor<1x3x!quant.uniform<i8:
// CHECK-LABEL: @transpose_canonicalize_strip_quant
func.func @transpose_canonicalize_strip_quant() -> (tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>) {
- // CHECK-DAG: tosa.const_shape {value = dense<[2, 1, 3]> : tensor<3xindex>} : () -> !tosa.shape<3>
- // CHECK-DAG: "tosa.const"() <{value = dense<0> : tensor<1x2x3xi8>}> : () -> tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>
- // CHECK: tosa.reshape %0, %1 : (tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>, !tosa.shape<3>) -> tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>
- %perms = "tosa.const"() {value = dense<[1, 0, 2]> : tensor<3xi32>} : () -> tensor<3xi32>
+ // CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[2, 1, 3]> : tensor<3xindex>} : () -> !tosa.shape<3>
+ // CHECK-DAG: %[[CONST:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x2x3xi8>}> : () -> tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>
+ // CHECK: tosa.reshape %[[CONST]], %[[SHAPE]] : (tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>, !tosa.shape<3>) -> tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>
%0 = "tosa.const"() {value = dense<0> : tensor<1x2x3xi8>} : ()-> tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>
- %1 = tosa.transpose %0, %perms : (tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>, tensor<3xi32>) -> tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>
+ %1 = tosa.transpose %0 { perms = array<i32: 1, 0, 2> }: (tensor<1x2x3x!quant.uniform<i8:f32, 1.000000e+00>>) -> tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>
return %1 : tensor<2x1x3x!quant.uniform<i8:f32, 1.000000e+00>>
}
@@ -688,8 +687,7 @@ func.func @tile_nofold(%arg0: tensor<3x4xf32>) -> tensor<3x8xf32> {
func.func @transpose_no_op(%arg0: tensor<3x4x5x6xf32>) -> tensor<3x4x5x6xf32> {
// CHECK: return %arg0
// CHECK-NOT: tosa.transpose
- %perms = "tosa.const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %arg0, %perms : (tensor<3x4x5x6xf32>, tensor<4xi32>) -> tensor<3x4x5x6xf32>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 0, 1, 2, 3> }: (tensor<3x4x5x6xf32>) -> tensor<3x4x5x6xf32>
return %1 : tensor<3x4x5x6xf32>
}
@@ -699,13 +697,22 @@ func.func @transpose_no_op(%arg0: tensor<3x4x5x6xf32>) -> tensor<3x4x5x6xf32> {
func.func @transpose_is_reshape(%arg0: tensor<1x4x5x1xf32>) -> tensor<1x4x1x5xf32> {
// CHECK: %[[CONST0:.+]] = tosa.const_shape {value = dense<[1, 4, 1, 5]> : tensor<4xindex>} : () -> !tosa.shape<4>
// CHECK: tosa.reshape %arg0, %[[CONST0]]
- %perms = "tosa.const"() <{value = dense<[3, 1, 0, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<1x4x5x1xf32>, tensor<4xi32>) -> tensor<1x4x1x5xf32>
+ %0 = tosa.transpose %arg0 { perms = array<i32: 3, 1, 0, 2> }: (tensor<1x4x5x1xf32>) -> tensor<1x4x1x5xf32>
return %0 : tensor<1x4x1x5xf32>
}
// -----
+// CHECK-LABEL: @transpose_is_reshape_unknown_dim
+func.func @transpose_is_reshape_unknown_dim(%arg0: tensor<1x4x?x1xf32>) -> tensor<1x4x1x?xf32> {
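+  // The non-unit dims (4 and ?) keep their relative order, so this transpose
+  // is expressible as a reshape; the unknown extent becomes -1 in the
+  // const_shape operand.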
+ // CHECK: %[[CONST0:.+]] = tosa.const_shape {value = dense<[1, 4, 1, -1]> : tensor<4xindex>} : () -> !tosa.shape<4>
+ // CHECK: tosa.reshape %arg0, %[[CONST0]]
+ %0 = tosa.transpose %arg0 { perms = array<i32: 3, 1, 0, 2> }: (tensor<1x4x?x1xf32>) -> tensor<1x4x1x?xf32>
+ return %0 : tensor<1x4x1x?xf32>
+}
+
+// -----
+
// CHECK-LABEL: @single_bit_reshape
// https://github.com/llvm/llvm-project/issues/55440
func.func @single_bit_reshape() -> tensor<1xi1> {
@@ -1012,3 +1019,14 @@ func.func nested @do_not_fold_reciprocal_int() -> tensor<3x600x1200xi32> {
%2 = "tosa.reciprocal"(%1): (tensor<3x600x1200xi32>) -> tensor<3x600x1200xi32>
return %2 : tensor<3x600x1200xi32>
}
+
+// -----
+
+// CHECK-LABEL: @do_not_fold_int_div_division_by_0
+func.func @do_not_fold_int_div_division_by_0() -> tensor<1x24x2xi32> {
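+  // Folding would need a defined result for division by zero, so the op is
+  // kept as-is.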
+ // CHECK: tosa.int_div
+ %1 = "tosa.const"() <{value = dense<0> : tensor<1x24x2xi32>}> : () -> tensor<1x24x2xi32>
+ %4 = "tosa.const"() <{value = dense<20> : tensor<1x24x2xi32>}> : () -> tensor<1x24x2xi32>
+ %16 = tosa.int_div %4, %1 : (tensor<1x24x2xi32>, tensor<1x24x2xi32>) -> tensor<1x24x2xi32>
+ return %16 : tensor<1x24x2xi32>
+}
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index e6fb741..190aa77 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -20,34 +20,30 @@ func.func @argmax_dynamic_shape_no_fold_dim_size_1(%arg0: tensor<?x1x3xf32>) ->
// CHECK-LABEL: @transpose_fold
func.func @transpose_fold(%arg0: tensor<3x4xf32>) -> tensor<3x4xf32> {
// CHECK: return %arg0
- %0 = arith.constant dense<[0, 1]> : tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 { perms = [1, 0] }: (tensor<3x4xf32>, tensor<2xi32>) -> tensor<3x4xf32>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 0, 1> }: (tensor<3x4xf32>) -> tensor<3x4xf32>
return %1 : tensor<3x4xf32>
}
// CHECK-LABEL: @transpose_nofold
func.func @transpose_nofold(%arg0: tensor<3x3xf32>) -> tensor<3x3xf32> {
// CHECK: tosa.transpose
- %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 { perms = [1, 0] }: (tensor<3x3xf32>, tensor<2xi32>) -> tensor<3x3xf32>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 1, 0> }: (tensor<3x3xf32>) -> tensor<3x3xf32>
return %1 : tensor<3x3xf32>
}
// CHECK-LABEL: @transpose_nofold_shape
func.func @transpose_nofold_shape(%arg0: tensor<3x4xf32>) -> tensor<?x?xf32> {
// CHECK: tosa.transpose
- %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 { perms = [1, 0] }: (tensor<3x4xf32>, tensor<2xi32>) -> tensor<?x?xf32>
+ %1 = tosa.transpose %arg0 { perms = array<i32: 1, 0> }: (tensor<3x4xf32>) -> tensor<?x?xf32>
return %1 : tensor<?x?xf32>
}
// CHECK-LABEL: @transpose_fold_splat
func.func @transpose_fold_splat() -> tensor<3x2xf32> {
%input = "tosa.const"() {value = dense<4.0> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<4.000000e+00> : tensor<3x2xf32>
- %1 = tosa.transpose %input, %perms : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %input { perms = array<i32: 1, 0> }: (tensor<2x3xf32>) -> tensor<3x2xf32>
// CHECK: return %[[CST]]
return %1 : tensor<3x2xf32>
}
@@ -55,10 +51,9 @@ func.func @transpose_fold_splat() -> tensor<3x2xf32> {
// CHECK-LABEL: @transpose_fold_2d_float
func.func @transpose_fold_2d_float() -> tensor<3x2xf32> {
%input = "tosa.const"() {value = dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32>
- %1 = tosa.transpose %input, %perms : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %input { perms = array<i32: 1, 0> }: (tensor<2x3xf32>) -> tensor<3x2xf32>
// CHECK: return %[[CST]]
return %1 : tensor<3x2xf32>
}
@@ -66,10 +61,9 @@ func.func @transpose_fold_2d_float() -> tensor<3x2xf32> {
// CHECK-LABEL: @transpose_fold_2d_bool
func.func @transpose_fold_2d_bool() -> tensor<3x2xi1> {
%input = "tosa.const"() {value = dense<[[true, false, false], [false, false, true]]> : tensor<2x3xi1>} : () -> tensor<2x3xi1>
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<[[true, false], [false, false], [false, true]]> : tensor<3x2xi1>
- %1 = tosa.transpose %input, %perms : (tensor<2x3xi1>, tensor<2xi32>) -> tensor<3x2xi1>
+ %1 = tosa.transpose %input { perms = array<i32: 1, 0> }: (tensor<2x3xi1>) -> tensor<3x2xi1>
// CHECK: return %[[CST]]
return %1 : tensor<3x2xi1>
}
@@ -80,50 +74,46 @@ func.func @transpose_fold_4d_int() -> tensor<3x1x4x2xi32> {
[[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]],
[[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
]]> : tensor<1x2x3x4xi32>} : () -> tensor<1x2x3x4xi32>
- %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<[
// CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]],
// CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]],
// CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]]
// CHECK-SAME{LITERAL}: ]>
- %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<3x1x4x2xi32>
+ %1 = tosa.transpose %input { perms = array<i32: 2, 0, 3, 1> }: (tensor<1x2x3x4xi32>) -> tensor<3x1x4x2xi32>
// CHECK: return %[[CST]]
return %1 : tensor<3x1x4x2xi32>
}
// CHECK-LABEL: @transpose_nofold_non_cst_input
func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>) -> tensor<3x2xf32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// CHECK: tosa.transpose
- %1 = tosa.transpose %input, %perms : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
- return %1 : tensor<3x2xf32>
-}
-
-// CHECK-LABEL: @transpose_nofold_non_cst_perms
-func.func @transpose_nofold_non_cst_perms(%perms: tensor<2xi32>) -> tensor<3x2xf32> {
- %input = "tosa.const"() {value = dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
- // CHECK: tosa.transpose
- %1 = tosa.transpose %input, %perms : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %input { perms = array<i32: 1, 0> }: (tensor<2x3xf32>) -> tensor<3x2xf32>
return %1 : tensor<3x2xf32>
}
// CHECK-LABEL: @transpose_nofold_multi_users
func.func @transpose_nofold_multi_users() -> (tensor<3x2xf32>, tensor<2x3xf32>) {
%input = "tosa.const"() {value = dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// CHECK: tosa.transpose
- %1 = tosa.transpose %input, %perms : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %input { perms = array<i32: 1, 0> }: (tensor<2x3xf32>) -> tensor<3x2xf32>
return %1, %input : tensor<3x2xf32>, tensor<2x3xf32>
}
+// CHECK-LABEL: @transpose_nofold_quantized_types
+func.func @transpose_nofold_quantized_types() -> tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>> {
+ %input = "tosa.const"() {value = dense<-127> : tensor<2x1x1x2xi8>} : () -> tensor<2x1x1x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
+ // CHECK: tosa.transpose
+ %0 = tosa.transpose %input { perms = array<i32: 1, 2, 3, 0> }: (tensor<2x1x1x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>) -> tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
+ return %0: tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
+}
+
// CHECK-LABEL: @transpose_nofold_dense_resource
func.func @transpose_nofold_dense_resource() -> tensor<2x2xf32> {
%0 = "tosa.const"() <{value = dense_resource<resource> : tensor<2x2xf32>}> : () -> tensor<2x2xf32>
- %1 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
// CHECK: tosa.transpose
- %2 = tosa.transpose %0, %1 : (tensor<2x2xf32>, tensor<2xi32>) -> tensor<2x2xf32>
+ %2 = tosa.transpose %0 { perms = array<i32: 1, 0> }: (tensor<2x2xf32>) -> tensor<2x2xf32>
return %2 : tensor<2x2xf32>
}
{-#
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 1aa8547..9123f84 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -4,7 +4,7 @@
// validation flow.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=pro_int,pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable strict-op-spec-alignment"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=pro_int,pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow strict-op-spec-alignment"
func.func @test_const() -> tensor<1xf32> {
// expected-error@+1{{'tosa.const' op expected same attr/result element types}}
@@ -235,97 +235,74 @@ func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>) -> tensor
// -----
-func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> {
- // expected-error@+1 {{'tosa.transpose' op perms of transpose is not constant}}
- %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
- return %0 : tensor<3x13x21xf32>
-}
-
-// -----
-
func.func @test_transpose_io_rank_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21x1xf32> {
// expected-error@+1 {{'tosa.transpose' op expected input tensor rank to equal result tensor rank}}
- %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21x1xf32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 2, 1, 0>}: (tensor<13x21x3xf32>) -> tensor<3x13x21x1xf32>
return %0 : tensor<3x13x21x1xf32>
}
// -----
-func.func @test_transpose_invalid_perms_rank(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<3x13x21xf32> {
- // expected-error@+1 {{'tosa.transpose' op expected permutation tensor to be rank 1 but got rank 2}}
- %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<3x13x21xf32>
- return %0 : tensor<3x13x21xf32>
-}
-
-// -----
-
func.func @test_transpose_rank0_perms() {
%14 = tensor.empty() : tensor<5x27xi64>
- %cst = tensor.empty() : tensor<i32>
- // expected-error@+1 {{'tosa.transpose' op expected permutation tensor to be rank 1 but got rank 0}}
- %72 = tosa.transpose %14, %cst : (tensor<5x27xi64>, tensor<i32>) -> tensor<?x?xi64>
+ // expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 2 (input rank) but got size 0}}
+ %72 = tosa.transpose %14 {perms = array<i32> }: (tensor<5x27xi64>) -> tensor<?x?xi64>
return
}
// -----
-func.func @test_transpose_invalid_perms_size(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7xi32>) -> tensor<3x13x21xf32> {
- // expected-error@+1 {{'tosa.transpose' op expected permutation tensor dim 0 to have size 3 (input rank) but got size 7}}
- %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<7xi32>) -> tensor<3x13x21xf32>
+func.func @test_transpose_invalid_perms_size(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
+ // expected-error@+1 {{'tosa.transpose' op expected perms attribute to have size 3 (input rank) but got size 7}}
+ %0 = tosa.transpose %arg0 {perms = array<i32: 6, 5, 4, 3, 2, 1, 0> }: (tensor<13x21x3xf32>) -> tensor<3x13x21xf32>
return %0 : tensor<3x13x21xf32>
}
// -----
func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>) -> tensor<?x?x?xf32> {
- %perms = arith.constant dense<[2, 0, 0]> : tensor<3xi32>
- // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
- %0 = tosa.transpose %arg0, %perms : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %0 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 0> }: (tensor<13x21x3xf32>) -> tensor<?x?x?xf32>
return %0 : tensor<?x?x?xf32>
}
// -----
func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
- %perms = "tosa.const"() {value = dense<[-1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
- %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %1 = tosa.transpose %arg0 {perms = array<i32: -1, 0> }: (tensor<3x2xi32>) -> tensor<*xi32>
return %1 : tensor<*xi32>
}
// -----
func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
- %perms = "tosa.const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
- %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0> }: (tensor<3x2xi32>) -> tensor<*xi32>
return %1 : tensor<*xi32>
}
// -----
func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}}
- %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0> }: (tensor<3x2xi32>) -> tensor<3x4xi32>
return %1 : tensor<3x4xi32>
}
// -----
func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}}
- %1 = tosa.transpose %arg0, %perms : (tensor<2x?xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0> }: (tensor<2x?xi32>) -> tensor<3x4xi32>
return %1 : tensor<3x4xi32>
}
// -----
func.func @test_transpose_element_type_mismatch(%arg0: tensor<2x3xi32>) -> tensor<3x2xf32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
// expected-error@+1 {{'tosa.transpose' op failed to verify that all of {input1, output} have same element type}}
- %1 = tosa.transpose %arg0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xf32>
return %1 : tensor<3x2xf32>
}
@@ -674,10 +651,9 @@ func.func @test_tile_io_rank_mismatch() {
// CHECK-LABEL: @test_invalid_constant_permutation
func.func @test_invalid_constant_permutation() {
- // expected-error@+3 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = tensor.empty() : tensor<3x4x5xi32>
- %1 = arith.constant dense<[3, 0, 1]> : tensor<3xi32>
- %2 = tosa.transpose %0, %1 : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<3x4x5xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %2 = tosa.transpose %0 {perms = array<i32: 3, 0, 1>} : (tensor<3x4x5xi32>) -> tensor<3x4x5xi32>
return
}
@@ -685,11 +661,10 @@ func.func @test_invalid_constant_permutation() {
// CHECK-LABEL: test_rank_size_constant_permutation
func.func @test_rank_size_constant_permutation() {
- // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
- %1 = arith.constant dense<[0, 2]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
- %3 = tosa.transpose %2, %1 : (tensor<?x27xi64>, tensor<2xi32>) -> tensor<?x27xi64>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %3 = tosa.transpose %2 {perms = array<i32: 0, 2>} : (tensor<?x27xi64>) -> tensor<?x27xi64>
return
}
@@ -697,11 +672,10 @@ func.func @test_rank_size_constant_permutation() {
// CHECK-LABEL: test_large_constant_permutation
func.func @test_large_constant_permutation() {
- // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
- %1 = arith.constant dense<[1185677355, 332462212]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
- %3 = tosa.transpose %2, %1 : (tensor<?x27xi64>, tensor<2xi32>) -> tensor<?x27xi64>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation indices}}
+ %3 = tosa.transpose %2 {perms = array<i32: 1185677355, 332462212>} : (tensor<?x27xi64>) -> tensor<?x27xi64>
return
}
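// COM: The rejected cases above all violate the same rule: perms must be a
// COM: permutation of 0..N-1 for a rank-N input, so duplicates (2, 0, 0),
// COM: negatives (-1, 0), and out-of-range values (3, 0, 1 for rank 3, 0, 2
// COM: for rank 2, or the large constants here) all fail verification.
// COM: A valid rank-2 sketch (hypothetical names):
// COM:   %r = tosa.transpose %x {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<2x3xi32>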
diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir
index 046b9d56..684875f 100644
--- a/mlir/test/Dialect/Tosa/invalid_extension.mlir
+++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir
@@ -2,7 +2,7 @@
// Enable all supported profiles to focus the verification of expected extension requirement errors.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int,pro_fp,mt strict-op-spec-alignment"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int,pro_fp strict-op-spec-alignment"
// -----
func.func @test_fft2d(%arg0: tensor<1x4x8xf32>, %arg1: tensor<1x4x8xf32>) -> (tensor<1x4x8xf32>, tensor<1x4x8xf32>) {
@@ -36,3 +36,37 @@ func.func @test_cast_bf16_i32(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xi32
return %0 : tensor<13x21x3xi32>
}
+// -----
+func.func @test_cond_if(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i1>) -> tensor<f32> {
+ // expected-error@+1 {{'tosa.cond_if' op illegal: requires [controlflow]}}
+ %0 = tosa.cond_if %arg2 -> (tensor<f32>) {
+ %1 = tosa.add %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<f32>
+ tosa.yield %1 : tensor<f32>
+ } else {
+ %1 = tosa.sub %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<f32>
+ tosa.yield %1 : tensor<f32>
+ }
+ return %0 : tensor<f32>
+}
+
+// -----
+func.func @test_while_loop(%arg0: tensor<10xi32>, %arg1: tensor<i32>) {
+ %0 = "tosa.const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
+ // expected-error@+1 {{'tosa.while_loop' op illegal: requires [controlflow]}}
+ %1:3 = tosa.while_loop (%arg2 = %0, %arg3 = %0, %arg4 = %arg0) : (tensor<i32>, tensor<i32>, tensor<10xi32>) -> (tensor<i32>, tensor<i32>, tensor<10xi32>) {
+ %2 = tosa.greater_equal %arg3, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
+ %3 = tosa.logical_not %2 : (tensor<i1>) -> tensor<i1>
+ tosa.yield %3 : tensor<i1>
+ } do {
+ ^bb0(%arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<10xi32>):
+ %2 = "tosa.const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
+ %3 = tosa.add %arg3, %2 : (tensor<i32>, tensor<i32>) -> tensor<i32>
+ %7 = tosa.const_shape {value = dense<[1]> : tensor<1xindex>} : () -> !tosa.shape<1>
+ %4 = tosa.reshape %2, %7 : (tensor<i32>, !tosa.shape<1>) -> tensor<1xi32>
+ %5 = tosa.add %arg4, %4 : (tensor<10xi32>, tensor<1xi32>) -> tensor<10xi32>
+ %6 = tosa.add %arg2, %2 : (tensor<i32>, tensor<i32>) -> tensor<i32>
+ tosa.yield %6, %3, %5 : tensor<i32>, tensor<i32>, tensor<10xi32>
+ }
+ return
+}
+
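// COM: tosa.cond_if and tosa.while_loop are now gated on the controlflow
// COM: extension, which the RUN line above deliberately omits. A sketch of an
// COM: invocation that would accept both ops (assuming the same -tosa-validate
// COM: option syntax used elsewhere in these tests):
// COM:   mlir-opt %s -split-input-file -tosa-validate="profile=pro_int,pro_fp extension=controlflow strict-op-spec-alignment"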
diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir
index 90c4551..2a6561f 100644
--- a/mlir/test/Dialect/Tosa/level_check.mlir
+++ b/mlir/test/Dialect/Tosa/level_check.mlir
@@ -2,7 +2,7 @@
// Enable all supported profiles and extensions to focus the verification of expected level errors.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=pro_int,pro_fp,mt extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=pro_int,pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow"
func.func @test_argmax(%arg0: tensor<1x1x1x1x29x29x4xf32>) -> tensor<1x1x1x1x29x4xi32> {
// expected-error@+1 {{'tosa.argmax' op failed level check: operand rank(shape) <= MAX_RANK}}
@@ -105,9 +105,8 @@ func.func @test_tile(%arg0: tensor<1x1x1x1x13x21x3xf32>) -> tensor<1x1x1x1x39x21
// -----
func.func @test_transpose(%arg0: tensor<13x21x3x1x1x1x1xf32>) -> tensor<3x13x21x1x1x1x1xf32> {
- %0 = "tosa.const"() {value = dense<[2, 0, 1, 3, 4, 5, 6]> : tensor<7xi32>} : () -> tensor<7xi32>
// expected-error@+1 {{'tosa.transpose' op failed level check: operand rank(shape) <= MAX_RANK}}
- %1 = "tosa.transpose"(%arg0, %0) : (tensor<13x21x3x1x1x1x1xf32>, tensor<7xi32>) -> tensor<3x13x21x1x1x1x1xf32>
+ %1 = "tosa.transpose"(%arg0) {perms = array<i32: 2, 0, 1, 3, 4, 5, 6>} : (tensor<13x21x3x1x1x1x1xf32>) -> tensor<3x13x21x1x1x1x1xf32>
return %1 : tensor<3x13x21x1x1x1x1xf32>
}
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index cd73377..fe3d2b0 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// RUN: mlir-opt %s --verify-each | mlir-opt | FileCheck %s
// RUN: mlir-opt %s --mlir-print-op-generic | mlir-opt | FileCheck %s
@@ -640,24 +640,21 @@ func.func @test_tile(%arg0: tensor<13x21x3xf32>) -> tensor<39x21x6xf32> {
// -----
// CHECK-LABEL: transpose
func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
- %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>} : (tensor<13x21x3xf32>) -> tensor<3x13x21xf32>
return %1 : tensor<3x13x21xf32>
}
// -----
// CHECK-LABEL: transpose_dynamic_dim
func.func @test_transpose_dynamic_dim(%arg0: tensor<13x?x3xf32>) -> tensor<3x13x?xf32> {
- %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<13x?x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>} : (tensor<13x?x3xf32>) -> tensor<3x13x?xf32>
return %1 : tensor<3x13x?xf32>
}
// -----
// CHECK-LABEL: transpose_half_dynamic_dim
func.func @test_transpose_half_dynamic_dim(%arg0: tensor<13x3x3xf32>) -> tensor<3x13x?xf32> {
- %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<13x3x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 0, 1>} : (tensor<13x3x3xf32>) -> tensor<3x13x?xf32>
return %1 : tensor<3x13x?xf32>
}
diff --git a/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir
index 6dddcf3..8183b58 100644
--- a/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir
+++ b/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir
@@ -2,7 +2,7 @@
// Enable all supported extensions to focus the verification of expected profile requirement errors.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable strict-op-spec-alignment"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow strict-op-spec-alignment"
// -----
func.func @test_table(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () {
diff --git a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
index c46b254..f7cbd11 100644
--- a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
+++ b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir
@@ -2,7 +2,7 @@
// Enable all supported extensions to focus the verification of expected profile requirement errors.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable strict-op-spec-alignment"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow strict-op-spec-alignment"
// -----
func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
diff --git a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir
index 479b756..1d6d33b 100644
--- a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir
+++ b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir
@@ -2,7 +2,7 @@
// Enable all supported extensions to focus the verification of expected profile requirement errors.
//--------------------------------------------------------------------------------------------------
-// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable strict-op-spec-alignment"
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow strict-op-spec-alignment"
// -----
func.func @test_table(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () {
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
index ae7a8e9..944dd5f 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
@@ -54,11 +54,10 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1:
func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<5x3x5x3xf32>, %arg2: tensor<5xf32>) -> tensor<2x?x?x5xf32> {
// Manipulate the weight matrix to handle striding.
// CHECK-DAG: %[[PADV:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
- // CHECK-DAG: %[[TRANSV:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[PADW:.+]] = tosa.pad %arg1, %[[PADV]]
// CHECK-DAG: %[[CONST1:.+]] = tosa.const_shape {value = dense<[5, 2, 2, 2, 3, 3]> : tensor<6xindex>}
// CHECK-DAG: %[[RESW1:.+]] = tosa.reshape %[[PADW]], %[[CONST1]]
- // CHECK-DAG: %[[TRANS:.+]] = tosa.transpose %[[RESW1]], %[[TRANSV]]
+ // CHECK-DAG: %[[TRANS:.+]] = tosa.transpose %[[RESW1]] {perms = array<i32: 2, 4, 0, 1, 3, 5>}
// CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[30, 2, 2, 3]> : tensor<4xindex>}
// CHECK-DAG: %[[RESW2:.+]] = tosa.reshape %[[TRANS]], %[[CONST3]]
// CHECK-DAG: %[[REV1:.+]] = tosa.reverse %[[RESW2]] {axis = 1 : i32}
@@ -68,7 +67,6 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
// Pad out the input matrix to handle the transpose conv.
// CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
- // CHECK-DAG: %[[TRANS2:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]]
// Manipulate the final shape.
@@ -76,7 +74,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
// CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
// CHECK-DAG: %[[CONST6:.+]] = tosa.const_shape {value = dense<[2, 18, 16, 2, 3, 5]> : tensor<6xindex>}
// CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]], %[[CONST6]]
- // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
+ // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]] {perms = array<i32: 0, 1, 3, 2, 4, 5>}
// CHECK-DAG: %[[CONST8:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>
// CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[CONST8]]
// CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]], %[[START]], %[[SIZE]]
@@ -95,11 +93,10 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1: tensor<5x3x5x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x35x47x5xi32>) {
// Manipulate the weight matrix to handle striding.
// CHECK-DAG: %[[PADV:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
- // CHECK-DAG: %[[TRANSV:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[PADW:.+]] = tosa.pad %arg1, %[[PADV]] {input_zp = 42 : i32}
// CHECK-DAG: %[[CONST1:.+]] = tosa.const_shape {value = dense<[5, 2, 2, 2, 3, 3]> : tensor<6xindex>}
// CHECK-DAG: %[[RESW1:.+]] = tosa.reshape %[[PADW]], %[[CONST1]]
- // CHECK-DAG: %[[TRANS:.+]] = tosa.transpose %[[RESW1]], %[[TRANSV]]
+ // CHECK-DAG: %[[TRANS:.+]] = tosa.transpose %[[RESW1]] {perms = array<i32: 2, 4, 0, 1, 3, 5>}
// CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[30, 2, 2, 3]> : tensor<4xindex>}
// CHECK-DAG: %[[RESW2:.+]] = tosa.reshape %[[TRANS]], %[[CONST3]]
// CHECK-DAG: %[[REV1:.+]] = tosa.reverse %[[RESW2]] {axis = 1 : i32}
@@ -109,7 +106,6 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
// Pad out the input matrix to handle the transpose conv.
// CHECK-DAG: %[[PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
- // CHECK-DAG: %[[TRANS2:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]] {input_zp = -22 : i32}
// Manipulate the final shape.
@@ -119,7 +115,7 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
// CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]], %[[INPUT_ZP]], %[[WEIGHT_ZP]] {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
// CHECK-DAG: %[[CONV_NEW_SHAPE:.*]] = tosa.const_shape {value = dense<[2, 18, 16, 2, 3, 5]> : tensor<6xindex>}
// CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]], %[[CONV_NEW_SHAPE]]
- // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
+ // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]] {perms = array<i32: 0, 1, 3, 2, 4, 5>}
// CHECK-DAG: %[[TRANS_NEW_SHAPE:.+]] = tosa.const_shape {value = dense<[2, 36, 48, 5]> : tensor<4xindex>}
// CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]], %[[TRANS_NEW_SHAPE]]
// CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]], %[[START]], %[[SIZE]]
@@ -138,12 +134,10 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 : tensor<1x2x1x1xi8>, %arg2 : tensor<1xi32>) -> (tensor<1x19x2x1xi32>) {
// CHECK-DAG: %[[WEIGHT_PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 0, 0, 0, 1, 0, 0]> : tensor<8xindex>}
// CHECK-DAG: %[[CONST1:.+]] = tosa.const_shape {value = dense<[1, 2, 1, 1, 2, 1]> : tensor<6xindex>}
- // CHECK-DAG: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[CONST3:.+]] = tosa.const_shape {value = dense<[2, 2, 1, 1]> : tensor<4xindex>}
// CHECK-DAG: %[[INPUT_PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 1, 0, 0, 0, 0]> : tensor<8xindex>}
// CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>}
// CHECK-DAG: %[[CONST6:.+]] = tosa.const_shape {value = dense<[1, 17, 1, 1, 2, 1]> : tensor<6xindex>}
- // CHECK-DAG: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
// CHECK-DAG: %[[CONST8:.+]] = tosa.const_shape {value = dense<[1, 17, 2, 1]> : tensor<4xindex>}
// CHECK-DAG: %[[RESULT_PAD:.+]] = tosa.const_shape {value = dense<[0, 0, 2, 0, 0, 0, 0, 0]> : tensor<8xindex>}
// CHECK-DAG: %[[CONST10:.+]] = tosa.const_shape {value = dense<1> : tensor<4xindex>}
@@ -151,13 +145,13 @@ func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 :
// CHECK-DAG: %[[WEIGHT_ZP:.*]] = "tosa.const"() <{value = dense<93> : tensor<1xi8>}>
// CHECK: %[[PAD_WEIGHT:.+]] = tosa.pad %arg1, %[[WEIGHT_PAD]] {input_zp = 93 : i32}
// CHECK: %[[RESHAPE_WEIGHT_0:.+]] = tosa.reshape %[[PAD_WEIGHT]], %[[CONST1]]
- // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]], %[[WEIGHT_PERMS]]
+ // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]] {perms = array<i32: 2, 4, 0, 1, 3, 5>}
// CHECK: %[[RESHAPE_WEIGHT_1:.+]] = tosa.reshape %[[TRANSPOSE_WEIGHT]], %[[CONST3]]
// CHECK: %[[REVERSE:.+]] = tosa.reverse %[[RESHAPE_WEIGHT_1]] {axis = 1 : i32}
// CHECK: %[[PAD_INPUT:.+]] = tosa.pad %arg0, %[[INPUT_PAD]] {input_zp = -103 : i32}
// CHECK: %[[CONV:.+]] = tosa.conv2d %[[PAD_INPUT]], %[[REVERSE]], %[[ZERO]], %[[INPUT_ZP]], %[[WEIGHT_ZP]] {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
// CHECK: %[[RESHAPE_RESULT_0:.+]] = tosa.reshape %[[CONV]], %[[CONST6]]
- // CHECK: %[[TRANSPOSE_RESULT:.+]] = tosa.transpose %[[RESHAPE_RESULT_0]], %[[RESULT_PERMS]]
+ // CHECK: %[[TRANSPOSE_RESULT:.+]] = tosa.transpose %[[RESHAPE_RESULT_0]] {perms = array<i32: 0, 1, 3, 2, 4, 5>}
// CHECK: %[[RESHAPE_RESULT_1:.+]] = tosa.reshape %[[TRANSPOSE_RESULT]], %[[CONST8]]
// CHECK: %[[PAD_RESULT:.+]] = tosa.pad %[[RESHAPE_RESULT_1]], %[[RESULT_PAD]]
// CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST10]]
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index b5b0cbf..1821b78 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -577,29 +577,10 @@ func.func @test_tile(%arg0 : tensor<2x3x?xi32>) -> () {
// -----
-// CHECK-LABEL: @test_transpose_same
-func.func @test_transpose_same(%arg0 : tensor<4x4x4xi32>, %arg1 : tensor<3xi32>) -> () {
- // CHECK: tosa.transpose %arg0, %arg1 : (tensor<4x4x4xi32>, tensor<3xi32>) -> tensor<4x4x4xi32>
- %0 = tosa.transpose %arg0, %arg1 : (tensor<4x4x4xi32>, tensor<3xi32>) -> tensor<?x?x?xi32>
- return
-}
-
-// -----
-
-// CHECK-LABEL: @test_transpose_perm_unknown
-func.func @test_transpose_perm_unknown(%arg0 : tensor<4x4x5xi32>, %arg1 : tensor<3xi32>) -> () {
- // CHECK: tosa.transpose %arg0, %arg1 : (tensor<4x4x5xi32>, tensor<3xi32>) -> tensor<?x?x?xi32>
- %0 = tosa.transpose %arg0, %arg1 : (tensor<4x4x5xi32>, tensor<3xi32>) -> tensor<?x?x?xi32>
- return
-}
-
-// -----
-
// CHECK-LABEL: @test_transpose_static
func.func @test_transpose_static(%arg0 : tensor<3x4x5xi32>) -> () {
- %0 = arith.constant dense<[2, 1, 0]> : tensor<3xi32>
- // CHECK: tosa.transpose %arg0, %cst : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<5x4x3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<?x?x?xi32>
+ // CHECK: tosa.transpose %arg0 {perms = array<i32: 2, 1, 0>} : (tensor<3x4x5xi32>) -> tensor<5x4x3xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 2, 1, 0>} : (tensor<3x4x5xi32>) -> tensor<?x?x?xi32>
return
}
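// COM: Shape inference reads the now-inline attribute directly:
// COM: output dim i = input dim perms[i], so dims (3, 4, 5) with
// COM: perms (2, 1, 0) refine tensor<?x?x?xi32> to tensor<5x4x3xi32>,
// COM: matching the CHECK line above.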
diff --git a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir
index 3a29300..ed87a6d 100644
--- a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir
@@ -4,11 +4,9 @@
// CHECK-NEXT: %[[RESULT:.*]] = tosa.ceil %arg0
// CHECK-NEXT: return %[[RESULT]]
func.func @test_transpose_tracks_to_nullifying_single_step(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%ceil = tosa.ceil %0 : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %ceil, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %1 = tosa.transpose %ceil {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %1 : tensor<1x2x3x4xi32>
}
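// COM: Why these two perms nullify each other: chaining transposes composes
// COM: the permutations, out[i] = in[p0[p1[i]]], and here that is the identity:
// COM:   p0 = (0, 2, 3, 1), p1 = (0, 3, 1, 2)
// COM:   p0[p1[0]] = p0[0] = 0,  p0[p1[1]] = p0[3] = 1
// COM:   p0[p1[2]] = p0[1] = 2,  p0[p1[3]] = p0[2] = 3
// COM: so the pass can erase both transposes and keep only the ceil.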
@@ -20,13 +18,11 @@ func.func @test_transpose_tracks_to_nullifying_single_step(%arg0: tensor<1x2x3x4
// CHECK-NEXT: %[[NOT:.*]] = tosa.bitwise_not %[[ABS]]
// CHECK-NEXT: return %[[NOT]]
func.func @test_transpose_tracks_to_nullifying_multi_unary_step(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %clamp : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%bitwise_not = tosa.bitwise_not %abs : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %bitwise_not, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %1 = tosa.transpose %bitwise_not {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %1 : tensor<1x2x3x4xi32>
}
@@ -38,14 +34,12 @@ func.func @test_transpose_tracks_to_nullifying_multi_unary_step(%arg0: tensor<1x
// CHECK-NEXT: %[[ADD:.*]] = tosa.add %[[CLAMP]], %[[ABS]]
// CHECK-NEXT: return %[[ADD]]
func.func @test_transpose_tracks_to_nullifying_diverging_binary(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
- %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose1 = tosa.transpose %arg1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %transpose0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %transpose1 : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %add, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %add {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
@@ -58,14 +52,12 @@ func.func @test_transpose_tracks_to_nullifying_diverging_binary(%arg0: tensor<1x
// CHECK-NEXT: %[[ADD:.*]] = tosa.add %[[CLAMP]], %[[ABS]]
// CHECK-NEXT: return %[[ADD]]
func.func @test_transpose_tracks_to_nullifying_diverging_binary_with_broadcasting(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x1x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
- %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x1x4xi32>, tensor<4xi32>) -> tensor<1x1x4x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose1 = tosa.transpose %arg1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x1x4xi32>) -> tensor<1x1x4x2xi32>
%clamp = tosa.clamp %transpose0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %transpose1 : (tensor<1x1x4x2xi32>) -> tensor<1x1x4x2xi32>
%add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x1x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %add, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %add {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
@@ -75,11 +67,9 @@ func.func @test_transpose_tracks_to_nullifying_diverging_binary_with_broadcastin
// CHECK-NEXT: %[[RESULT:.*]] = tosa.add %arg0, %arg0
// CHECK-NEXT: return %[[RESULT]]
func.func @test_transpose_tracks_to_nullifying__converging_binary(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%clamp = tosa.add %0, %0 : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %1 : tensor<1x2x3x4xi32>
}
@@ -102,20 +92,20 @@ func.func @test_torch_conv2d_with_elementwise_in_between(%arg0: tensor<3x3x10x10
%5 = "tosa.const"() <{value = dense_resource<torch_tensor_3_torch.float32> : tensor<3xf32>}> : () -> tensor<3xf32>
%6 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32>
%7 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
- %8 = tosa.transpose %arg0, %6 : (tensor<3x3x10x10xf32>, tensor<4xi32>) -> tensor<3x10x10x3xf32>
- %9 = tosa.transpose %4, %6 : (tensor<3x3x2x2xf32>, tensor<4xi32>) -> tensor<3x2x2x3xf32>
+ %8 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x10x10xf32>) -> tensor<3x10x10x3xf32>
+ %9 = tosa.transpose %4 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x2x2xf32>) -> tensor<3x2x2x3xf32>
%10 = tosa.conv2d %8, %9, %5 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<3x10x10x3xf32>, tensor<3x2x2x3xf32>, tensor<3xf32>) -> tensor<3x9x9x3xf32>
- %11 = tosa.transpose %10, %7 : (tensor<3x9x9x3xf32>, tensor<4xi32>) -> tensor<3x3x9x9xf32>
+ %11 = tosa.transpose %10 {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x9x9x3xf32>) -> tensor<3x3x9x9xf32>
%12 = tosa.ceil %11 : (tensor<3x3x9x9xf32>) -> tensor<3x3x9x9xf32>
- %13 = tosa.transpose %12, %6 : (tensor<3x3x9x9xf32>, tensor<4xi32>) -> tensor<3x9x9x3xf32>
- %14 = tosa.transpose %3, %6 : (tensor<3x3x2x2xf32>, tensor<4xi32>) -> tensor<3x2x2x3xf32>
+ %13 = tosa.transpose %12 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x9x9xf32>) -> tensor<3x9x9x3xf32>
+ %14 = tosa.transpose %3 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x2x2xf32>) -> tensor<3x2x2x3xf32>
%15 = tosa.conv2d %13, %14, %2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<3x9x9x3xf32>, tensor<3x2x2x3xf32>, tensor<3xf32>) -> tensor<3x8x8x3xf32>
- %16 = tosa.transpose %15, %7 : (tensor<3x8x8x3xf32>, tensor<4xi32>) -> tensor<3x3x8x8xf32>
+ %16 = tosa.transpose %15 {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x8x8x3xf32>) -> tensor<3x3x8x8xf32>
%17 = tosa.floor %16 : (tensor<3x3x8x8xf32>) -> tensor<3x3x8x8xf32>
- %18 = tosa.transpose %17, %6 : (tensor<3x3x8x8xf32>, tensor<4xi32>) -> tensor<3x8x8x3xf32>
- %19 = tosa.transpose %1, %6 : (tensor<3x3x2x2xf32>, tensor<4xi32>) -> tensor<3x2x2x3xf32>
+ %18 = tosa.transpose %17 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x8x8xf32>) -> tensor<3x8x8x3xf32>
+ %19 = tosa.transpose %1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x2x2xf32>) -> tensor<3x2x2x3xf32>
%20 = tosa.conv2d %18, %19, %0 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<3x8x8x3xf32>, tensor<3x2x2x3xf32>, tensor<3xf32>) -> tensor<3x7x7x3xf32>
- %21 = tosa.transpose %20, %7 : (tensor<3x7x7x3xf32>, tensor<4xi32>) -> tensor<3x3x7x7xf32>
+ %21 = tosa.transpose %20 {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x7x7x3xf32>) -> tensor<3x3x7x7xf32>
return %21 : tensor<3x3x7x7xf32>
}
@@ -126,13 +116,11 @@ func.func @test_torch_conv2d_with_elementwise_in_between(%arg0: tensor<3x3x10x10
// CHECK-NEXT: %[[RES:.*]] = tosa.mul %arg0, %arg1, %[[SHIFT]]
// CHECK-NEXT: return %[[RES]]
func.func @test_mulop_conversion(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
- %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose1 = tosa.transpose %arg1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
%mul = tosa.mul %transpose0, %transpose1, %shift : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>, tensor<1xi8>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %mul, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %mul {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
@@ -141,15 +129,14 @@ func.func @test_mulop_conversion(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x3
// COM: this case is a reshape we don't convert, since we can't fold the transpose into it.
// COM: a transform actually occurs underneath the hood, but it results in identical IR.
// CHECK-LABEL: @test_basic_non_broadcasting_reshape
-// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[1, 3, 2]> : tensor<3xindex>}
-// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}>
-// CHECK: %[[VAL_3:.*]] = tosa.reshape %arg0, %[[VAL_1]] : (tensor<2x3xi32>, !tosa.shape<3>) -> tensor<1x3x2xi32>
-// CHECK: %[[VAL_4:.*]] = tosa.transpose %[[VAL_3]], %[[VAL_2]] : (tensor<1x3x2xi32>, tensor<3xi32>) -> tensor<1x2x3xi32>
+// CHECK: %[[SHAPE:.+]] = tosa.const_shape {value = dense<[1, 3, 2]> : tensor<3xindex>}
+// CHECK: %[[RESHAPED:.+]] = tosa.reshape %arg0, %[[SHAPE]] : (tensor<2x3xi32>, !tosa.shape<3>) -> tensor<1x3x2xi32>
+// CHECK: tosa.transpose %[[RESHAPED]] {perms = array<i32: 0, 2, 1>} : (tensor<1x3x2xi32>) -> tensor<1x2x3xi32>
func.func @test_basic_non_broadcasting_reshape(%arg0: tensor<2x3xi32>) -> tensor<1x2x3xi32> {
%shape = tosa.const_shape {value = dense<[1, 3, 2]> : tensor<3xindex>} : () -> !tosa.shape<3>
%perms = "tosa.const"() {value = dense<[0, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
%1 = tosa.reshape %arg0, %shape : (tensor<2x3xi32>, !tosa.shape<3>) -> tensor<1x3x2xi32>
- %2 = tosa.transpose %1, %perms : (tensor<1x3x2xi32>, tensor<3xi32>) -> tensor<1x2x3xi32>
+ %2 = tosa.transpose %1 {perms = array<i32: 0, 2, 1>} : (tensor<1x3x2xi32>) -> tensor<1x2x3xi32>
return %2 : tensor<1x2x3xi32>
}
@@ -163,7 +150,7 @@ func.func @test_dynamic_broadcasting_reshape(%arg0: tensor<?xi32>) -> tensor<1x1
%shape = tosa.const_shape {value = dense<[1, -1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
%perms = "tosa.const"() {value = dense<[0, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
%1 = tosa.reshape %arg0, %shape : (tensor<?xi32>, !tosa.shape<3>) -> tensor<1x?x1xi32>
- %2 = tosa.transpose %1, %perms : (tensor<1x?x1xi32>, tensor<3xi32>) -> tensor<1x1x?xi32>
+ %2 = tosa.transpose %1 {perms = array<i32: 0, 2, 1>} : (tensor<1x?x1xi32>) -> tensor<1x1x?xi32>
return %2 : tensor<1x1x?xi32>
}
@@ -179,10 +166,9 @@ func.func @test_reshape_for_broadcast(%arg0: tensor<4x3x2xi32>) -> tensor<4x3x2x
%0 = "tosa.const"() {value = dense<[1,2,3,4]> : tensor<4xi32>} : () -> tensor<4xi32>
%1 = tosa.const_shape {value = dense<[1, 1, 4]> : tensor<3xindex>} : () -> !tosa.shape<3>
%reshape = tosa.reshape %0, %1 : (tensor<4xi32>, !tosa.shape<3>) -> tensor<1x1x4xi32>
- %perms0 = "tosa.const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<4x3x2xi32>, tensor<3xi32>) -> tensor<2x3x4xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 2, 1, 0>} : (tensor<4x3x2xi32>) -> tensor<2x3x4xi32>
%add = tosa.add %transpose0, %reshape : (tensor<2x3x4xi32>, tensor<1x1x4xi32>) -> tensor<2x3x4xi32>
- %transpose1 = tosa.transpose %add, %perms0 : (tensor<2x3x4xi32>, tensor<3xi32>) -> tensor<4x3x2xi32>
+ %transpose1 = tosa.transpose %add {perms = array<i32: 2, 1, 0>} : (tensor<2x3x4xi32>) -> tensor<4x3x2xi32>
return %transpose1 : tensor<4x3x2xi32>
}
@@ -223,7 +209,7 @@ func.func @test_resnet18_common_case(%arg0: tensor<64xf32>, %arg1: tensor<64xf32
%64 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
%69 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1xf32>}> : () -> tensor<1xf32>
%70 = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1xf32>}> : () -> tensor<1xf32>
- %75 = tosa.transpose %74, %64 : (tensor<1x112x112x64xf32>, tensor<4xi32>) -> tensor<1x64x112x112xf32>
+ %75 = tosa.transpose %74 {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x112x112x64xf32>) -> tensor<1x64x112x112xf32>
%76 = tosa.add %arg1, %69 : (tensor<64xf32>, tensor<1xf32>) -> tensor<64xf32>
%77 = tosa.pow %76, %70 : (tensor<64xf32>, tensor<1xf32>) -> tensor<64xf32>
%78 = tosa.reciprocal %77 : (tensor<64xf32>) -> tensor<64xf32>
@@ -236,54 +222,45 @@ func.func @test_resnet18_common_case(%arg0: tensor<64xf32>, %arg1: tensor<64xf32
%85 = tosa.reshape %59, %58 : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x64x1x1xf32>
%86 = tosa.add %84, %85 : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>) -> tensor<1x64x112x112xf32>
%87 = tosa.clamp %86 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
- %88 = tosa.transpose %87, %63 : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32>
+ %88 = tosa.transpose %87 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x64x112x112xf32>) -> tensor<1x112x112x64xf32>
return %88 : tensor<1x112x112x64xf32>
}
// -----
// CHECK-LABEL: @test_back_to_back_nullifiers
-// CHECK: %[[PERMS:.*]] = "tosa.const"
-// CHECK: %[[RES:.*]] = tosa.transpose %arg0, %[[PERMS]]
+// CHECK: %[[RES:.*]] = tosa.transpose %arg0 {perms = array<i32: 1, 0>}
// CHECK: return %[[RES]]
func.func @test_back_to_back_nullifiers(%arg0: tensor<2x3xi32>) -> tensor<3x2xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xi32>
- %1 = tosa.transpose %0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<2x3xi32>
- %2 = tosa.transpose %1, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<2x3xi32>
+ %2 = tosa.transpose %1 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xi32>
return %2 : tensor<3x2xi32>
}
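// COM: perms (1, 0) is self-inverse, so adjacent pairs cancel and this
// COM: odd-length chain reduces to the single transpose the CHECK expects:
// COM:   %res = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xi32>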
// -----
// CHECK-LABEL: @test_back_to_back_nullifiers_different_transposes
-// CHECK: %[[PERMS:.*]] = "tosa.const"
-// CHECK: %[[RES:.*]] = tosa.transpose %arg0, %[[PERMS]]
+// CHECK: %[[RES:.*]] = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>}
// CHECK: return %[[RES]]
func.func @test_back_to_back_nullifiers_different_transposes(%arg0: tensor<2x3x4x5xi32>) -> tensor<2x4x5x3xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<2x3x4x5xi32>, tensor<4xi32>) -> tensor<2x4x5x3xi32>
- %1 = tosa.transpose %0, %perms1 : (tensor<2x4x5x3xi32>, tensor<4xi32>) -> tensor<2x3x4x5xi32>
- %2 = tosa.transpose %1, %perms0 : (tensor<2x3x4x5xi32>, tensor<4xi32>) -> tensor<2x4x5x3xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<2x3x4x5xi32>) -> tensor<2x4x5x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 0, 3, 1, 2>} : (tensor<2x4x5x3xi32>) -> tensor<2x3x4x5xi32>
+ %2 = tosa.transpose %1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<2x3x4x5xi32>) -> tensor<2x4x5x3xi32>
return %2 : tensor<2x4x5x3xi32>
}
// -----
// CHECK-LABEL: @test_no_transform_if_outside_fan_in_cone
-// CHECK: tosa.const
// CHECK: %[[CLAMP_IN:.*]] = tosa.transpose
// CHECK: %[[RES2:.*]] = tosa.clamp %[[CLAMP_IN]]
-// CHECK: tosa.const
// CHECK: %[[RES1:.*]] = tosa.transpose
// CHECK: return %[[RES1]], %[[RES2]]
func.func @test_no_transform_if_outside_fan_in_cone(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
return %1, %clamp : tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>
}
@@ -298,29 +275,25 @@ func.func @test_two_different_downstream_converge_to_reshape_same_perms(%arg0: t
%shape = tosa.const_shape {value = dense<[1, 64, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
%1 = tosa.reshape %arg0, %shape : (tensor<64xf32>, !tosa.shape<3>) -> tensor<1x64x1xf32>
%2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32>
- %3 = tosa.transpose %1, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
- %4 = tosa.transpose %2, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
+ %3 = tosa.transpose %1 {perms = array<i32: 0, 2, 1>} : (tensor<1x64x1xf32>) -> tensor<1x1x64xf32>
+ %4 = tosa.transpose %2 {perms = array<i32: 0, 2, 1>} : (tensor<1x64x1xf32>) -> tensor<1x1x64xf32>
return %3, %4 : tensor<1x1x64xf32>, tensor<1x1x64xf32>
}
// -----
// CHECK-LABEL: @test_two_different_downstream_converge_to_reshape_different_perms
-// CHECK-DAG: tosa.const
-// CHECK-DAG: tosa.const
// CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape
// CHECK-DAG: %[[CLAMP:.*]] = tosa.clamp %[[RESHAPE]]
// CHECK-DAG: %[[RET1:.*]] = tosa.transpose
// CHECK-DAG: %[[RET2:.*]] = tosa.transpose
// CHECK-DAG: return %[[RET1]], %[[RET2]]
func.func @test_two_different_downstream_converge_to_reshape_different_perms(%arg0: tensor<64xf32>) -> (tensor<1x1x64xf32>, tensor<64x1x1xf32>) {
- %0 = "tosa.const"() <{value = dense<[1, 2, 0]> : tensor<3xi32>}> : () -> tensor<3xi32>
- %1 = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
%shape = tosa.const_shape {value = dense<[1, 64, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
%2 = tosa.reshape %arg0, %shape : (tensor<64xf32>, !tosa.shape<3>) -> tensor<1x64x1xf32>
%3 = tosa.clamp %2 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32>
- %4 = tosa.transpose %2, %1 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
- %5 = tosa.transpose %3, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<64x1x1xf32>
+ %4 = tosa.transpose %2 {perms = array<i32: 0, 2, 1>} : (tensor<1x64x1xf32>) -> tensor<1x1x64xf32>
+ %5 = tosa.transpose %3 {perms = array<i32: 1, 2, 0>} : (tensor<1x64x1xf32>) -> tensor<64x1x1xf32>
return %4, %5 : tensor<1x1x64xf32>, tensor<64x1x1xf32>
}
@@ -328,16 +301,15 @@ func.func @test_two_different_downstream_converge_to_reshape_different_perms(%ar
// COM: no transform
// CHECK-LABEL: @test_outside_perms_usage_of_fan_in
-// CHECK: tosa.const
// CHECK: tosa.transpose
// CHECK: tosa.clamp
// CHECK: %[[RES1:.*]] = tosa.transpose
// CHECK: %[[RES2:.*]] = tosa.add
// CHECK: return %[[RES1]], %[[RES2]]
-func.func @test_outside_perms_usage_of_fan_in(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> (tensor<2x3xf32>, tensor<3x2xf32>) { %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+func.func @test_outside_perms_usage_of_fan_in(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> (tensor<2x3xf32>, tensor<3x2xf32>) {
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xf32>) -> tensor<3x2xf32>
%2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<3x2xf32>) -> tensor<3x2xf32>
- %3 = tosa.transpose %2, %0 : (tensor<3x2xf32>, tensor<2xi32>) -> tensor<2x3xf32>
+ %3 = tosa.transpose %2 {perms = array<i32: 1, 0>} : (tensor<3x2xf32>) -> tensor<2x3xf32>
%4 = tosa.add %arg1, %2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32>
return %3, %4: tensor<2x3xf32>, tensor<3x2xf32>
}
@@ -351,13 +323,12 @@ func.func @test_outside_perms_usage_of_fan_in(%arg0: tensor<2x3xf32>, %arg1: ten
// CHECK-DAG: %[[NEW_ADD:.*]] = tosa.add %arg1, %[[NEW_CLAMP]]
// CHECK: return %[[NEW_CLAMP]], %[[NEW_ADD]]
func.func @test_use_present_in_another_valid_perms_fan_in(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) -> (tensor<2x3xf32>, tensor<2x3xf32>) {
- %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xf32>) -> tensor<3x2xf32>
%2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<3x2xf32>) -> tensor<3x2xf32>
- %3 = tosa.transpose %2, %0 : (tensor<3x2xf32>, tensor<2xi32>) -> tensor<2x3xf32>
- %4 = tosa.transpose %arg1, %0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
+ %3 = tosa.transpose %2 {perms = array<i32: 1, 0>} : (tensor<3x2xf32>) -> tensor<2x3xf32>
+ %4 = tosa.transpose %arg1 {perms = array<i32: 1, 0>} : (tensor<2x3xf32>) -> tensor<3x2xf32>
%5 = tosa.add %4, %2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32>
- %6 = tosa.transpose %5, %0 : (tensor<3x2xf32>, tensor<2xi32>) -> tensor<2x3xf32>
+ %6 = tosa.transpose %5 {perms = array<i32: 1, 0>} : (tensor<3x2xf32>) -> tensor<2x3xf32>
return %3, %6: tensor<2x3xf32>, tensor<2x3xf32>
}
@@ -365,7 +336,6 @@ func.func @test_use_present_in_another_valid_perms_fan_in(%arg0: tensor<2x3xf32>
// COM: no transform, since we would get duplicates
// CHECK-LABEL: @test_two_same_perms_fan_in_but_one_doesnt_convert_dependents
-// CHECK: tosa.const
// CHECK: tosa.transpose
// CHECK: %[[CEIL:.*]] = tosa.ceil
// CHECK: %[[ADD:.*]] = tosa.add %[[CEIL]]
@@ -373,12 +343,11 @@ func.func @test_use_present_in_another_valid_perms_fan_in(%arg0: tensor<2x3xf32>
// CHECK: %[[RES2:.*]] = tosa.transpose %[[ADD]]
// CHECK: return %[[RES1]], %[[RES2]]
func.func @test_two_same_perms_fan_in_but_one_doesnt_convert_dependents(%arg0: tensor<2x3xi32>, %arg1: tensor<3x2xi32>) -> (tensor<2x3xi32>, tensor<2x3xi32>) {
- %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xi32>
%2 = tosa.ceil %1 : (tensor<3x2xi32>) -> tensor<3x2xi32>
%3 = tosa.add %2, %arg1 : (tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32>
- %4 = tosa.transpose %2, %0 : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<2x3xi32>
- %5 = tosa.transpose %3, %0 : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<2x3xi32>
+ %4 = tosa.transpose %2 {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<2x3xi32>
+ %5 = tosa.transpose %3 {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<2x3xi32>
return %4, %5 : tensor<2x3xi32>, tensor<2x3xi32>
}
@@ -388,12 +357,10 @@ func.func @test_two_same_perms_fan_in_but_one_doesnt_convert_dependents(%arg0: t
// CHECK-NEXT: %[[RES:.*]] = tosa.clamp %arg0
// CHECK-NEXT: return %[[RES]], %[[RES]]
func.func @test_direct_use_in_other_transpose_with_same_perms(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
- %2 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
+ %2 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
return %1, %2 : tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>
}
@@ -405,8 +372,7 @@ func.func @test_direct_use_in_other_transpose_with_same_perms(%arg0: tensor<3x3x
// CHECK: return %[[NEW]]
func.func @test_const_transpose() -> tensor<2x3xi32> {
%0 = "tosa.const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32>
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %1 = tosa.transpose %0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<2x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<2x3xi32>
return %1 : tensor<2x3xi32>
}
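// COM: The fold is value-level, not just shape-level: for a non-splat constant
// COM: the elements are permuted as well. A hypothetical example (not part of
// COM: this test, which uses a splat):
// COM:   transposing "tosa.const" dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>
// COM:   with perms (1, 0) folds to dense<[[1, 3, 5], [2, 4, 6]]> : tensor<2x3xi32>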
@@ -420,8 +386,7 @@ func.func @test_const_transpose() -> tensor<2x3xi32> {
func.func @test_transpose_tracks_to_const_single_step() -> tensor<1x2x3x4xi32> {
%0 = "tosa.const"() {value = dense<0> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %1 : tensor<1x2x3x4xi32>
}
@@ -434,13 +399,11 @@ func.func @test_transpose_tracks_to_const_single_step() -> tensor<1x2x3x4xi32> {
// CHECK: %[[NEW_NOT:.*]] = tosa.bitwise_not %[[NEW_ABS]] : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32>
// CHECK: return %[[NEW_NOT]]
func.func @test_static_unary_path_to_const() -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
%0 = "tosa.const"() {value = dense<1> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %clamp : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%bitwise_not = tosa.bitwise_not %abs : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %bitwise_not, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %1 = tosa.transpose %bitwise_not {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %1 : tensor<1x2x3x4xi32>
}
@@ -455,16 +418,14 @@ func.func @test_static_unary_path_to_const() -> tensor<1x2x3x4xi32> {
// CHECK: %[[NEW_ADD:.*]] = tosa.add %[[NEW_ABS]], %[[NEW_CLAMP]] : (tensor<1x2x3x4xi32>, tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32>
// CHECK: return %[[NEW_ADD]]
func.func @test_static_diverges_to_non_splat_const_and_nullifying(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
%const = "tosa.const"() {value = dense<[[[[1, 2], [3, 4], [5, 6], [7, 8]],
[[9, 10], [11, 12], [13, 14], [15, 16]],
[[17, 18], [19, 20], [21, 22], [23, 24]]]]> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %transpose0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %const : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%add = tosa.add %abs, %clamp : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms2 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %add, %perms2 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %add {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
@@ -474,12 +435,10 @@ func.func @test_static_diverges_to_non_splat_const_and_nullifying(%arg0: tensor<
// CHECK-NEXT: %[[RES:.*]] = tosa.clamp %arg0
// CHECK-NEXT: return %[[RES]], %[[RES]]
func.func @test_multi_downstream_both_nullify(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
- %2 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
+ %2 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
return %1, %2 : tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>
}
@@ -487,19 +446,15 @@ func.func @test_multi_downstream_both_nullify(%arg0: tensor<3x3x3x3xi32>) -> (te
// COM: we don't perform this transformation intentionally, since we would then get duplicates
// CHECK-LABEL: @test_multi_downstream_one_nullifies_upstream_other_does_not
-// CHECK: tosa.const
// CHECK: tosa.transpose
// CHECK: tosa.clamp
-// CHECK: tosa.const
// CHECK: tosa.transpose
// CHECK: tosa.transpose
func.func @test_multi_downstream_one_nullifies_upstream_other_does_not(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
%clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
- %2 = tosa.transpose %clamp, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
+ %2 = tosa.transpose %clamp {perms = array<i32: 0, 2, 3, 1>} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32>
return %1, %2 : tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>
}
@@ -508,9 +463,8 @@ func.func @test_multi_downstream_one_nullifies_upstream_other_does_not(%arg0: te
// CHECK-LABEL: @test_unknown_dim_inner_replacement_matches
// CHECK-NEXT: return %arg0
func.func @test_unknown_dim_inner_replacement_matches(%arg0: tensor<3x2xi32>) -> tensor<3x2xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<?x3xi32>
- %1 = tosa.transpose %0, %perms : (tensor<?x3xi32>, tensor<2xi32>) -> tensor<3x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<3x2xi32>) -> tensor<?x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<?x3xi32>) -> tensor<3x2xi32>
return %1 : tensor<3x2xi32>
}
@@ -520,9 +474,8 @@ func.func @test_unknown_dim_inner_replacement_matches(%arg0: tensor<3x2xi32>) ->
// CHECK-LABEL: @test_unknown_dim_outer_replacement_matches
// CHECK-NEXT: return %arg0
func.func @test_unknown_dim_outer_replacement_matches(%arg0: tensor<3x?xi32>) -> tensor<3x?xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<3x?xi32>, tensor<2xi32>) -> tensor<2x3xi32>
- %1 = tosa.transpose %0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x?xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<3x?xi32>) -> tensor<2x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x?xi32>
return %1 : tensor<3x?xi32>
}
@@ -534,47 +487,28 @@ func.func @test_unknown_dim_outer_replacement_matches(%arg0: tensor<3x?xi32>) ->
// CHECK-NEXT: %[[ADD:.*]] = tosa.add %[[CLAMP]], %[[ABS]]
// CHECK-NEXT: return %[[ADD]]
func.func @test_transpose_tracks_to_nullifying_diverging_binary_unknown_dim_replacements_match(%arg0: tensor<1x?x3x4xi32>, %arg1: tensor<1x2x?x4xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor<?x3x4x?xi32>
- %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x?x4xi32>, tensor<4xi32>) -> tensor<1x?x?x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x?x3x4xi32>) -> tensor<?x3x4x?xi32>
+ %transpose1 = tosa.transpose %arg1 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x?x4xi32>) -> tensor<1x?x?x2xi32>
%clamp = tosa.clamp %transpose0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor<?x3x4x?xi32>) -> tensor<?x3x4x?xi32>
%abs = tosa.abs %transpose1 : (tensor<1x?x?x2xi32>) -> tensor<1x?x?x2xi32>
%add = tosa.add %clamp, %abs : (tensor<?x3x4x?xi32>, tensor<1x?x?x2xi32>) -> tensor<1x3x4x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %add, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %add {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
// -----
-// COM: we cannot do anything to the transpose in this case.
-// CHECK-LABEL: @test_unimplemented_non_const_perms
-// CHECK: tosa.const
-// CHECK-NEXT: tosa.transpose
-// CHECK-NEXT: return
-func.func @test_unimplemented_non_const_perms(%perms: tensor<2xi32>) -> tensor<?x?xi32> {
- %0 = "tosa.const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32>
- %1 = tosa.transpose %0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<?x?xi32>
- return %1 : tensor<?x?xi32>
-}
-
-// -----
-
// COM: because tracking back reaches a non-nullifying transpose, we cannot get rid of the transposes entirely.
// COM: later versions of the pass may wish to fold these into a single transpose.
// CHECK-LABEL: @test_unimplemented_transpose_tracks_to_non_nullifying_transpose_single_step
-// CHECK: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.clamp
-// CHECK-NEXT: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_transpose_tracks_to_non_nullifying_transpose_single_step(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x4x3xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x4x3x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 3, 2, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x4x3x2xi32>
%clamp = tosa.clamp %0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor<1x4x3x2xi32>) -> tensor<1x4x3x2xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %clamp, %perms1 : (tensor<1x4x3x2xi32>, tensor<4xi32>) -> tensor<1x2x4x3xi32>
+ %1 = tosa.transpose %clamp {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x4x3x2xi32>) -> tensor<1x2x4x3xi32>
return %1 : tensor<1x2x4x3xi32>
}
@@ -582,28 +516,24 @@ func.func @test_unimplemented_transpose_tracks_to_non_nullifying_transpose_singl
// COM: we don't deal with this case; resolution of the shapes is required first.
// CHECK-LABEL: @test_unimplemented_unknown_dim_input_nullifying_pair
-// CHECK: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_unknown_dim_input_nullifying_pair(%arg0: tensor<3x?xi32>) -> tensor<3x2xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<3x?xi32>, tensor<2xi32>) -> tensor<2x3xi32>
- %1 = tosa.transpose %0, %perms : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<3x?xi32>) -> tensor<2x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<2x3xi32>) -> tensor<3x2xi32>
return %1 : tensor<3x2xi32>
}
// -----
// CHECK-LABEL: @test_unimplemented_unknown_dim_replacement_does_not_match
-// CHECK: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_unknown_dim_replacement_does_not_match(%arg0: tensor<3x?xi32>) -> tensor<?x?xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<3x?xi32>, tensor<2xi32>) -> tensor<?x3xi32>
- %1 = tosa.transpose %0, %perms : (tensor<?x3xi32>, tensor<2xi32>) -> tensor<?x?xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<3x?xi32>) -> tensor<?x3xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<?x3xi32>) -> tensor<?x?xi32>
return %1 : tensor<?x?xi32>
}
@@ -611,53 +541,43 @@ func.func @test_unimplemented_unknown_dim_replacement_does_not_match(%arg0: tens
// COM: this could be converted if --tosa-infer-shapes were run beforehand
// CHECK-LABEL: @test_unimplemented_unranked_tensors_present
-// CHECK: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_unranked_tensors_present(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
- %perms = "tosa.const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
- %1 = tosa.transpose %0, %perms : (tensor<*xi32>, tensor<2xi32>) -> tensor<*xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 0, 1>} : (tensor<3x2xi32>) -> tensor<*xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 0, 1>} : (tensor<*xi32>) -> tensor<*xi32>
return %1 : tensor<*xi32>
}
// -----
// CHECK-LABEL: @test_unimplemented_unranked_everything
-// CHECK: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_unranked_everything(%arg0: tensor<*xi32>) -> tensor<*xi32> {
- %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
- %0 = tosa.transpose %arg0, %perms : (tensor<*xi32>, tensor<2xi32>) -> tensor<*xi32>
- %1 = tosa.transpose %0, %perms : (tensor<*xi32>, tensor<2xi32>) -> tensor<*xi32>
+ %0 = tosa.transpose %arg0 {perms = array<i32: 1, 0>} : (tensor<*xi32>) -> tensor<*xi32>
+ %1 = tosa.transpose %0 {perms = array<i32: 1, 0>} : (tensor<*xi32>) -> tensor<*xi32>
return %1 : tensor<*xi32>
}
// -----
// CHECK-LABEL: @test_unimplemented_static_diverges_to_one_nullifying_one_non_nullifying
-// CHECK: tosa.const
-// CHECK-NEXT: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: tosa.clamp
// CHECK-NEXT: tosa.abs
// CHECK-NEXT: tosa.add
-// CHECK-NEXT: tosa.const
// CHECK-NEXT: tosa.transpose
// CHECK-NEXT: return
func.func @test_unimplemented_static_diverges_to_one_nullifying_one_non_nullifying(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x4x3xi32>) -> tensor<1x2x3x4xi32> {
- %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %perms1 = "tosa.const"() {value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
- %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
- %transpose1 = tosa.transpose %arg1, %perms1 : (tensor<1x2x4x3xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose0 = tosa.transpose %arg0 {perms = array<i32: 0, 2, 3, 1>} : (tensor<1x2x3x4xi32>) -> tensor<1x3x4x2xi32>
+ %transpose1 = tosa.transpose %arg1 {perms = array<i32: 0, 3, 2, 1>} : (tensor<1x2x4x3xi32>) -> tensor<1x3x4x2xi32>
%clamp = tosa.clamp %transpose0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%abs = tosa.abs %transpose1 : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
%add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32>
- %perms2 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %result = tosa.transpose %add, %perms2 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32>
+ %result = tosa.transpose %add {perms = array<i32: 0, 3, 1, 2>} : (tensor<1x3x4x2xi32>) -> tensor<1x2x3x4xi32>
return %result : tensor<1x2x3x4xi32>
}
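
For reference (an editor's sketch, not part of the patch): a transpose pair is "nullifying" when the second permutation is the inverse of the first, so the pair composes to the identity. In Python, using the perms from the tests above:

    def invert(perms):
        # inv[perms[i]] = i: the permutation that undoes perms.
        inv = [0] * len(perms)
        for i, p in enumerate(perms):
            inv[p] = i
        return inv

    # array<i32: 0, 2, 3, 1> followed by array<i32: 0, 3, 1, 2> is a
    # nullifying pair in the tests above.
    assert invert([0, 2, 3, 1]) == [0, 3, 1, 2]
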
diff --git a/mlir/test/Dialect/Tosa/transpose-fold.mlir b/mlir/test/Dialect/Tosa/transpose-fold.mlir
index a7e9982..d29cb54 100644
--- a/mlir/test/Dialect/Tosa/transpose-fold.mlir
+++ b/mlir/test/Dialect/Tosa/transpose-fold.mlir
@@ -6,10 +6,8 @@
// CHECK: }
func.func @test_cancel_transpose_transpose(%arg0: tensor<1x2x3xi32>) -> (tensor<1x2x3xi32>) {
- %0 = "tosa.const"() {value = dense<[1, 2, 0]> : tensor<3xi32>} : () -> tensor<3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32>
- %2 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
- %3 = tosa.transpose %1, %2 : (tensor<2x3x1xi32>, tensor<3xi32>) -> tensor<1x2x3xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 2, 0>} : (tensor<1x2x3xi32>) -> tensor<2x3x1xi32>
+ %3 = tosa.transpose %1 {perms = array<i32: 2, 0, 1>} : (tensor<2x3x1xi32>) -> tensor<1x2x3xi32>
return %3 : tensor<1x2x3xi32>
}
@@ -21,8 +19,7 @@ func.func @test_cancel_transpose_transpose(%arg0: tensor<1x2x3xi32>) -> (tensor<
// CHECK: }
func.func @test_remove_identity_transpose(%arg0: tensor<1x2x3xi32>) -> (tensor<1x2x3xi32>) {
- %0 = "tosa.const"() {value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<1x2x3xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 0, 1, 2>} : (tensor<1x2x3xi32>) -> tensor<1x2x3xi32>
return %1 : tensor<1x2x3xi32>
}
@@ -30,16 +27,13 @@ func.func @test_remove_identity_transpose(%arg0: tensor<1x2x3xi32>) -> (tensor<1
// CHECK-LABEL: func.func @test_do_not_cancel_different_transpose(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3x4x5xi32>) -> tensor<5x4x3x2xi32> {
-// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<[3, 2, 1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32>
-// CHECK: %[[VAL_2:.*]] = tosa.transpose %[[VAL_0]], %[[VAL_1]] : (tensor<2x3x4x5xi32>, tensor<4xi32>) -> tensor<5x4x3x2xi32>
+// CHECK: %[[VAL_2:.*]] = tosa.transpose %[[VAL_0]] {perms = array<i32: 3, 2, 1, 0>} : (tensor<2x3x4x5xi32>) -> tensor<5x4x3x2xi32>
// CHECK: return %[[VAL_2]] : tensor<5x4x3x2xi32>
// CHECK: }
func.func @test_do_not_cancel_different_transpose(%arg0: tensor<2x3x4x5xi32>) -> (tensor<5x4x3x2xi32>) {
- %0 = "tosa.const"() {value = dense<[1, 2, 0, 3]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<2x3x4x5xi32>, tensor<4xi32>) -> tensor<3x4x2x5xi32>
- %2 = "tosa.const"() {value = dense<[3, 1, 0, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %3 = tosa.transpose %1, %2 : (tensor<3x4x2x5xi32>, tensor<4xi32>) -> tensor<5x4x3x2xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 2, 0, 3>} : (tensor<2x3x4x5xi32>) -> tensor<3x4x2x5xi32>
+ %3 = tosa.transpose %1 {perms = array<i32: 3, 1, 0, 2>} : (tensor<3x4x2x5xi32>) -> tensor<5x4x3x2xi32>
return %3 : tensor<5x4x3x2xi32>
}
@@ -47,15 +41,12 @@ func.func @test_do_not_cancel_different_transpose(%arg0: tensor<2x3x4x5xi32>) ->
// CHECK-LABEL: func.func @test_prefer_compose_transpose(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3x4xi32>) -> tensor<4x3x2x1xi32> {
-// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<[3, 2, 1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32>
-// CHECK: %[[VAL_2:.*]] = tosa.transpose %[[VAL_0]], %[[VAL_1]] : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<4x3x2x1xi32>
+// CHECK: %[[VAL_2:.*]] = tosa.transpose %[[VAL_0]] {perms = array<i32: 3, 2, 1, 0>} : (tensor<1x2x3x4xi32>) -> tensor<4x3x2x1xi32>
// CHECK: return %[[VAL_2]] : tensor<4x3x2x1xi32>
// CHECK: }
func.func @test_prefer_compose_transpose(%arg0: tensor<1x2x3x4xi32>) -> (tensor<4x3x2x1xi32>) {
- %0 = "tosa.const"() {value = dense<[1, 2, 0, 3]> : tensor<4xi32>} : () -> tensor<4xi32>
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<2x3x1x4xi32>
- %2 = "tosa.const"() {value = dense<[3, 1, 0, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
- %3 = tosa.transpose %1, %2 : (tensor<2x3x1x4xi32>, tensor<4xi32>) -> tensor<4x3x2x1xi32>
+ %1 = tosa.transpose %arg0 {perms = array<i32: 1, 2, 0, 3>} : (tensor<1x2x3x4xi32>) -> tensor<2x3x1x4xi32>
+ %3 = tosa.transpose %1 {perms = array<i32: 3, 1, 0, 2>} : (tensor<2x3x1x4xi32>) -> tensor<4x3x2x1xi32>
return %3 : tensor<4x3x2x1xi32>
}
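
For reference (a sketch, not pass code): transpose-fold replaces back-to-back transposes with a single transpose whose permutation is the composition of the two; cancellation is the special case where the composition is the identity. Checked against the perms above:

    def compose(perms0, perms1):
        # Output dim i of the fused transpose reads perms0[perms1[i]].
        return [perms0[i] for i in perms1]

    # @test_prefer_compose_transpose: [1,2,0,3] then [3,1,0,2] folds to [3,2,1,0].
    assert compose([1, 2, 0, 3], [3, 1, 0, 2]) == [3, 2, 1, 0]
    # @test_cancel_transpose_transpose: [1,2,0] then [2,0,1] is the identity.
    assert compose([1, 2, 0], [2, 0, 1]) == [0, 1, 2]
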
diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
index dfc79a1..3ae1883 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir
@@ -363,9 +363,30 @@ func.func @xfer_read_minor_identity_transposed_masked_scalable(
}
///----------------------------------------------------------------------------------------
-/// vector.transfer_read
+/// [Pattern: TransferOpReduceRank]
+///
+/// IN: vector.transfer_read (minor identity map + broadcast)
+/// OUT: vector.transfer_read + vector.broadcast
///----------------------------------------------------------------------------------------
-/// TODO: Review and categorize
+
+// CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims
+// CHECK-SAME: %[[MEM:.*]]: memref<?x?x?x?xf32>, %[[IDX:.*]]: index) -> vector<8x4x2x3xf32> {
+// CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]], %[[IDX]], %[[IDX]]]{{.*}} permutation_map = #[[$MAP]]} : memref<?x?x?x?xf32>, vector<4x2x3xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[T_READ]] : vector<4x2x3xf32> to vector<8x4x2x3xf32>
+// CHECK: return %[[BC]] : vector<8x4x2x3xf32>
+func.func @xfer_read_minor_identitiy_bcast_dims(
+ %mem: memref<?x?x?x?xf32>,
+ %idx: index) -> vector<8x4x2x3xf32> {
+
+ %pad = arith.constant 0.000000e+00 : f32
+
+ %res = vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad {
+ in_bounds = [true, true, true, true],
+ permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
+ } : memref<?x?x?x?xf32>, vector<8x4x2x3xf32>
+
+ return %res : vector<8x4x2x3xf32>
+}
// CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_scalable
// CHECK-SAME: %[[MEM:.*]]: memref<?x?x?x?xf32>, %[[IDX:.*]]: index) -> vector<8x[4]x2x3xf32> {
@@ -373,7 +394,8 @@ func.func @xfer_read_minor_identity_transposed_masked_scalable(
// CHECK: %[[BC:.*]] = vector.broadcast %[[T_READ]] : vector<[4]x2x3xf32> to vector<8x[4]x2x3xf32>
// CHECK: return %[[BC]] : vector<8x[4]x2x3xf32>
func.func @xfer_read_minor_identitiy_bcast_dims_scalable(
- %mem: memref<?x?x?x?xf32>, %idx: index) -> vector<8x[4]x2x3xf32> {
+ %mem: memref<?x?x?x?xf32>,
+ %idx: index) -> vector<8x[4]x2x3xf32> {
%pad = arith.constant 0.000000e+00 : f32
@@ -385,18 +407,64 @@ func.func @xfer_read_minor_identitiy_bcast_dims_scalable(
return %res : vector<8x[4]x2x3xf32>
}
+// CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_with_mask
+// CHECK-SAME: %[[MEM:.*]]: memref<?x?x?x?xf32>
+// CHECK-SAME: %[[MASK:.*]]: vector<4x3xi1>
+// CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x4x2x3xf32>
+// CHECK: %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]], %[[IDX]], %[[IDX]]], %[[PASS_THROUGH]], %[[MASK]]{{.*}} permutation_map = #[[$MAP]]} : memref<?x?x?x?xf32>, vector<4x2x3xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[T_READ]] : vector<4x2x3xf32> to vector<8x4x2x3xf32>
+// CHECK: return %[[BC]] : vector<8x4x2x3xf32>
+func.func @xfer_read_minor_identitiy_bcast_dims_with_mask(
+ %mem: memref<?x?x?x?xf32>,
+ %mask: vector<4x3xi1>,
+ %idx: index) -> vector<8x4x2x3xf32> {
+
+ %pad = arith.constant 0.000000e+00 : f32
+
+ %res = vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad, %mask {
+ in_bounds = [true, true, true, true],
+ permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
+ } : memref<?x?x?x?xf32>, vector<8x4x2x3xf32>
+
+ return %res : vector<8x4x2x3xf32>
+}
+
+// CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_with_mask_scalable
+// CHECK-SAME: %[[MEM:.*]]: memref<?x?x?x?xf32>
+// CHECK-SAME: %[[MASK:.*]]: vector<[4]x3xi1>
+// CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x[4]x2x3xf32>
+// CHECK: %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]][%[[IDX]], %[[IDX]], %[[IDX]], %[[IDX]]], %[[PASS_THROUGH]], %[[MASK]]{{.*}} permutation_map = #[[$MAP]]} : memref<?x?x?x?xf32>, vector<[4]x2x3xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[T_READ]] : vector<[4]x2x3xf32> to vector<8x[4]x2x3xf32>
+// CHECK: return %[[BC]] : vector<8x[4]x2x3xf32>
+func.func @xfer_read_minor_identitiy_bcast_dims_with_mask_scalable(
+ %mem: memref<?x?x?x?xf32>,
+ %mask: vector<[4]x3xi1>,
+ %idx: index) -> vector<8x[4]x2x3xf32> {
+
+ %pad = arith.constant 0.000000e+00 : f32
+
+ %res = vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad, %mask {
+ in_bounds = [true, true, true, true],
+ permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
+ } : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32>
+
+ return %res : vector<8x[4]x2x3xf32>
+}
+
// Masked version is not supported
// CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_masked
// CHECK-SAME: %[[MEM:.*]]: memref<?x?x?x?xf32>,
-// CHECK-SAME: %[[MASK:.*]]: vector<[4]x3xi1>
-// CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x[4]x2x3xf32> {
+// CHECK-SAME: %[[MASK:.*]]: vector<4x3xi1>
+// CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x4x2x3xf32> {
// CHECK-NOT: vector.broadcast
-// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[MEM]]{{.*}} : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
+// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[MEM]]{{.*}} : memref<?x?x?x?xf32>, vector<8x4x2x3xf32> } : vector<4x3xi1> -> vector<8x4x2x3xf32>
func.func @xfer_read_minor_identitiy_bcast_dims_masked(
%mem: memref<?x?x?x?xf32>,
- %mask: vector<[4]x3xi1>,
- %idx: index) -> vector<8x[4]x2x3xf32> {
+ %mask: vector<4x3xi1>,
+ %idx: index) -> vector<8x4x2x3xf32> {
%pad = arith.constant 0.000000e+00 : f32
@@ -404,12 +472,15 @@ func.func @xfer_read_minor_identitiy_bcast_dims_masked(
vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad {
in_bounds = [true, true, true, true],
permutation_map = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, d3)>
- } : memref<?x?x?x?xf32>, vector<8x[4]x2x3xf32>
- } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32>
+ } : memref<?x?x?x?xf32>, vector<8x4x2x3xf32>
+ } : vector<4x3xi1> -> vector<8x4x2x3xf32>
- return %res : vector<8x[4]x2x3xf32>
+ return %res : vector<8x4x2x3xf32>
}
+///----------------------------------------------------------------------------------------
+/// TD sequence
+///----------------------------------------------------------------------------------------
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%f = transform.structured.match ops{["func.func"]} in %module_op
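
For reference: per the pattern header above, TransferOpReduceRank rewrites a transfer_read whose permutation map has broadcast (constant-0) results into a lower-rank read plus a vector.broadcast. A sketch of the shape computation, assuming the pattern peels only the leading broadcast dims, which matches the CHECK lines (vector<8x4x2x3> with map (0, d1, 0, d3) reduces to a vector<4x2x3> read):

    def reduced_read_shape(result_shape, map_results):
        # Peel leading broadcast dims (map results equal to the constant 0);
        # inner broadcast dims, like the third result of (0, d1, 0, d3),
        # stay in the reduced read.
        k = 0
        while k < len(map_results) and map_results[k] == 0:
            k += 1
        return result_shape[k:]

    assert reduced_read_shape([8, 4, 2, 3], [0, "d1", 0, "d3"]) == [4, 2, 3]
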
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index cf18c07..e4f2623 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -3270,3 +3270,91 @@ llvm.func @omp_task_if(%boolexpr: i1) {
// -----
module attributes {omp.requires = #omp<clause_requires reverse_offload|unified_shared_memory>} {}
+
+// -----
+
+llvm.func @distribute() {
+ %0 = llvm.mlir.constant(42 : index) : i64
+ %1 = llvm.mlir.constant(10 : index) : i64
+ %2 = llvm.mlir.constant(1 : index) : i64
+ omp.distribute {
+ omp.loop_nest (%arg1) : i64 = (%1) to (%0) step (%2) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @distribute
+// CHECK: call void @[[OUTLINED:.*]]({{.*}})
+// CHECK-NEXT: br label %[[EXIT:.*]]
+// CHECK: [[EXIT]]:
+// CHECK: ret void
+
+// CHECK: define internal void @[[OUTLINED]]({{.*}})
+// CHECK: %[[LASTITER:.*]] = alloca i32
+// CHECK: %[[LB:.*]] = alloca i64
+// CHECK: %[[UB:.*]] = alloca i64
+// CHECK: %[[STRIDE:.*]] = alloca i64
+// CHECK: br label %[[BODY:.*]]
+// CHECK: [[BODY]]:
+// CHECK-NEXT: br label %[[REGION:.*]]
+// CHECK: [[REGION]]:
+// CHECK-NEXT: br label %[[PREHEADER:.*]]
+// CHECK: [[PREHEADER]]:
+// CHECK: store i64 0, ptr %[[LB]]
+// CHECK: store i64 31, ptr %[[UB]]
+// CHECK: store i64 1, ptr %[[STRIDE]]
+// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
+// CHECK: call void @__kmpc_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 92, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[STRIDE]], i64 1, i64 0)
+
+// -----
+
+llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
+ omp.parallel {
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ llvm.return
+}
+
+// CHECK-LABEL: define void @distribute_wsloop
+// CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]],
+
+// CHECK: define internal void @[[OUTLINED_PARALLEL]]
+// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]]({{.*}})
+
+// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]]
+// CHECK: %[[LASTITER:.*]] = alloca i32
+// CHECK: %[[LB:.*]] = alloca i32
+// CHECK: %[[UB:.*]] = alloca i32
+// CHECK: %[[STRIDE:.*]] = alloca i32
+// CHECK: br label %[[AFTER_ALLOCA:.*]]
+
+// CHECK: [[AFTER_ALLOCA]]:
+// CHECK: br label %[[DISTRIBUTE_BODY:.*]]
+
+// CHECK: [[DISTRIBUTE_BODY]]:
+// CHECK-NEXT: br label %[[DISTRIBUTE_REGION:.*]]
+
+// CHECK: [[DISTRIBUTE_REGION]]:
+// CHECK-NEXT: br label %[[WSLOOP_REGION:.*]]
+
+// CHECK: [[WSLOOP_REGION]]:
+// CHECK: %omp_loop.tripcount = select {{.*}}
+// CHECK-NEXT: br label %[[PREHEADER:.*]]
+
+// CHECK: [[PREHEADER]]:
+// CHECK: store i32 0, ptr %[[LB]]
+// CHECK: %[[TRIPCOUNT:.*]] = sub i32 %omp_loop.tripcount, 1
+// CHECK: store i32 %[[TRIPCOUNT]], ptr %[[UB]]
+// CHECK: store i32 1, ptr %[[STRIDE]]
+// CHECK: %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
+// CHECK: %[[DIST_UB:.*]] = alloca i32
+// CHECK: call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)
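
For reference: the distribute translation normalizes the loop to a zero-based iteration space before handing it to the runtime, so the alloca'd bounds hold lb = 0 and an inclusive ub = tripcount - 1; the schedule ids 92 and 34 appear to be the runtime's distribute-static and static schedule kinds (an assumption, not re-verified here). A sketch, assuming omp.loop_nest upper bounds are exclusive by default:

    def static_init_bounds(lb, ub, step):
        # Normalized bounds passed to __kmpc_for_static_init: the loop runs
        # over [0, tripcount) and ub is stored inclusive. Assumes step > 0.
        tripcount = (ub - lb + step - 1) // step
        return 0, tripcount - 1

    # Matches the CHECK lines: (10) to (42) step (1) stores lb=0, ub=31.
    assert static_init_bounds(10, 42, 1) == (0, 31)
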
diff --git a/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir
new file mode 100644
index 0000000..7930554
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir
@@ -0,0 +1,96 @@
+// RUN: split-file %s %t
+// RUN: mlir-translate -mlir-to-llvmir %t/host.mlir | FileCheck %s --check-prefix=HOST
+// RUN: mlir-translate -mlir-to-llvmir %t/device.mlir | FileCheck %s --check-prefix=DEVICE
+
+//--- host.mlir
+
+module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+ llvm.func @main(%x : i32) {
+ omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) {
+ omp.teams {
+ omp.parallel {
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// HOST-LABEL: define void @main
+// HOST: %omp_loop.tripcount = {{.*}}
+// HOST-NEXT: br label %[[ENTRY:.*]]
+// HOST: [[ENTRY]]:
+// HOST-NEXT: %[[TRIPCOUNT:.*]] = zext i32 %omp_loop.tripcount to i64
+// HOST: %[[TRIPCOUNT_KARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KARGS:.*]], i32 0, i32 8
+// HOST-NEXT: store i64 %[[TRIPCOUNT]], ptr %[[TRIPCOUNT_KARG]]
+// HOST: %[[RESULT:.*]] = call i32 @__tgt_target_kernel({{.*}}, ptr %[[KARGS]])
+// HOST-NEXT: %[[CMP:.*]] = icmp ne i32 %[[RESULT]], 0
+// HOST-NEXT: br i1 %[[CMP]], label %[[OFFLOAD_FAILED:.*]], label %{{.*}}
+// HOST: [[OFFLOAD_FAILED]]:
+// HOST: call void @[[TARGET_OUTLINE:.*]]({{.*}})
+
+// HOST: define internal void @[[TARGET_OUTLINE]]
+// HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}})
+
+// HOST: define internal void @[[TEAMS_OUTLINE]]
+// HOST: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})
+
+// HOST: define internal void @[[PARALLEL_OUTLINE]]
+// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})
+
+// HOST: define internal void @[[DISTRIBUTE_OUTLINE]]
+// HOST: call void @__kmpc_dist_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
+
+//--- device.mlir
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
+ llvm.func @main(%x : i32) {
+ omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) {
+ omp.teams {
+ omp.parallel {
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 2
+// DEVICE: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata"
+// DEVICE: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy {
+// DEVICE-SAME: %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 [[EXEC_MODE:2]], {{.*}}},
+// DEVICE-SAME: ptr @{{.*}}, ptr @{{.*}} }
+
+// DEVICE: define weak_odr protected amdgpu_kernel void @[[KERNEL_NAME]]({{.*}})
+// DEVICE: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL_NAME]]_kernel_environment, {{.*}})
+// DEVICE: call void @[[TARGET_OUTLINE:.*]]({{.*}})
+// DEVICE: call void @__kmpc_target_deinit()
+
+// DEVICE: define internal void @[[TARGET_OUTLINE]]({{.*}})
+// DEVICE: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})
+
+// DEVICE: define internal void @[[PARALLEL_OUTLINE]]({{.*}})
+// DEVICE: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})
+
+// DEVICE: define internal void @[[DISTRIBUTE_OUTLINE]]({{.*}})
+// DEVICE: call void @__kmpc_distribute_for_static_loop{{.*}}({{.*}})
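
For reference: the _exec_mode global pinned to i8 2 by these checks corresponds to SPMD mode in LLVM's OpenMP device exec-mode constants (generic is 1, generic-SPMD is 3), assuming those enum values are current:

    # Assumed to mirror LLVM's OpenMP target exec-mode flags.
    EXEC_MODES = {1: "GENERIC", 2: "SPMD", 3: "GENERIC_SPMD"}

    # The test's @..._exec_mode global holds i8 2.
    assert EXEC_MODES[2] == "SPMD"
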
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index e97b5e5..f907bb3 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -66,10 +66,51 @@ llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) {
// -----
-llvm.func @distribute(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{not yet implemented: omp.distribute}}
+llvm.func @distribute_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.distribute operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
- omp.distribute {
+ omp.distribute allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @distribute_dist_schedule(%lb : i32, %ub : i32, %step : i32, %x : i32) {
+ // expected-error@below {{not yet implemented: Unhandled clause dist_schedule with chunk_size in omp.distribute operation}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
+ omp.distribute dist_schedule_static dist_schedule_chunk_size(%x : i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @distribute_order(%lb : i32, %ub : i32, %step : i32) {
+ // expected-error@below {{not yet implemented: Unhandled clause order in omp.distribute operation}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
+ omp.distribute order(concurrent) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+omp.private {type = private} @x.privatizer : !llvm.ptr
+
+llvm.func @distribute_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause privatization in omp.distribute operation}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.distribute}}
+ omp.distribute private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
omp.yield
}
@@ -278,30 +319,6 @@ llvm.func @target_has_device_addr(%x : !llvm.ptr) {
// -----
-llvm.func @target_host_eval(%x : i32) {
- // expected-error@below {{not yet implemented: host evaluation of loop bounds in omp.target operation}}
- // expected-error@below {{LLVM Translation failed for operation: omp.target}}
- omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) {
- omp.teams {
- omp.parallel {
- omp.distribute {
- omp.wsloop {
- omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
- omp.yield
- }
- } {omp.composite}
- } {omp.composite}
- omp.terminator
- } {omp.composite}
- omp.terminator
- }
- omp.terminator
- }
- llvm.return
-}
-
-// -----
-
omp.declare_reduction @add_f32 : f32
init {
^bb0(%arg: f32):
diff --git a/mlir/test/Target/SPIRV/image-ops.mlir b/mlir/test/Target/SPIRV/image-ops.mlir
index 6b52a84..3c28c3f 100644
--- a/mlir/test/Target/SPIRV/image-ops.mlir
+++ b/mlir/test/Target/SPIRV/image-ops.mlir
@@ -1,11 +1,11 @@
// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip %s | FileCheck %s
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery, Linkage], []> {
spirv.func @image(%arg0 : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32) "None" {
// CHECK: {{%.*}} = spirv.Image {{%.*}} : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>
%0 = spirv.Image %arg0 : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>
- // CHECK: {{%.*}} = spirv.ImageDrefGather {{%.*}} : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>, {{%.*}} : vector<4xf32>, {{%.*}} : f32 -> vector<4xf32>
- %1 = spirv.ImageDrefGather %arg0 : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>, %arg1 : vector<4xf32>, %arg2 : f32 -> vector<4xf32>
+ // CHECK: {{%.*}} = spirv.ImageDrefGather {{%.*}}, {{%.*}}, {{%.*}} : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xf32>
+ %1 = spirv.ImageDrefGather %arg0, %arg1, %arg2 : !spirv.sampled_image<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Unknown>>, vector<4xf32>, f32 -> vector<4xf32>
spirv.Return
}
spirv.func @image_query_size(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>) "None" {
@@ -14,26 +14,18 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery], []>
spirv.Return
}
spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
- // CHECK: spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
+ // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
spirv.Return
}
- spirv.func @main() "None" {
- spirv.Return
- }
- spirv.EntryPoint "GLCompute" @main
}
// -----
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, StorageImageWriteWithoutFormat], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, StorageImageWriteWithoutFormat, Linkage], []> {
spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
- // CHECK: spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
- spirv.ImageWrite %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>
- spirv.Return
- }
- spirv.func @main() "None" {
+ // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
+ spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
spirv.Return
}
- spirv.EntryPoint "GLCompute" @main
}
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index c048f8b..f1c3165 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -148,8 +148,8 @@ def TestType : Test_Type<"Test", [
}
def TestTypeWithLayoutType : Test_Type<"TestTypeWithLayout", [
- DeclareTypeInterfaceMethods<DataLayoutTypeInterface, ["getIndexBitwidth",
- "areCompatible"]>
+ DeclareTypeInterfaceMethods<DataLayoutTypeInterface,
+ ["getIndexBitwidth", "areCompatible", "getPreferredAlignment"]>
]> {
let mnemonic = "test_type_with_layout";
let parameters = (ins "unsigned":$key);
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index d765807..bb60fc6 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -166,9 +166,6 @@ DECLARE_TSAN_FUNCTION(AnnotateNewMemory, const char *, int,
const volatile void *, size_t)
DECLARE_TSAN_FUNCTION(__tsan_func_entry, const void *)
DECLARE_TSAN_FUNCTION(__tsan_func_exit)
-
-// RunningOnValgrind is used to detect absence of TSan and must intentionally be a nullptr.
-static int (*RunningOnValgrind)(void);
}
// This marker is used to define a happens-before arc. The race detector will
@@ -1252,13 +1249,15 @@ ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
// The OMPT start-up code uses dlopen with RTLD_LAZY. Therefore, we cannot
// rely on dlopen to fail if TSan is missing, but would get a runtime error
- // for the first TSan call. We use RunningOnValgrind to detect whether
+ // for the first TSan call. We use __tsan_init to detect whether
// an implementation of the Annotation interface is available in the
// execution or disable the tool (by returning NULL).
- findTsanFunctionSilent(RunningOnValgrind, (int (*)(void)));
- if (!RunningOnValgrind) // if we are not running on TSAN, give a different
- // tool the chance to be loaded
+ void (*__tsan_init)(void) = nullptr;
+
+ findTsanFunctionSilent(__tsan_init, (void (*)(void)));
+ if (!__tsan_init) // if we are not running on TSAN, give a different
+ // tool the chance to be loaded
{
if (archer_flags->verbose)
std::cout << "Archer detected OpenMP application without TSan; "
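
For reference: the patch swaps the probe symbol from RunningOnValgrind to __tsan_init but keeps the mechanism: look a TSan symbol up in the running image and bail out if it is absent. An analogous probe in Python's ctypes (an illustration only; CDLL(None) is the POSIX dlopen(NULL) handle, not what Archer itself uses):

    import ctypes

    def tsan_runtime_present() -> bool:
        # dlsym the current process image for a TSan entry point; ctypes
        # raises AttributeError when the symbol is missing.
        try:
            getattr(ctypes.CDLL(None), "__tsan_init")
            return True
        except AttributeError:
            return False
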
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index ad58428..3cbde42 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -370,6 +370,40 @@ cc_library(
],
)
+td_library(
+ name = "RelayoutOpInterfaceTdFiles",
+ srcs = ["include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td"],
+ includes = ["include"],
+ deps = [
+ ":DestinationStyleOpInterfaceTdFiles"
+ ],
+)
+
+gentbl_cc_library(
+ name = "RelayoutOpInterfaceIncGen",
+ tbl_outs = [
+ (
+ ["-gen-op-interface-decls"],
+ "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h.inc",
+ ),
+ (
+ ["-gen-op-interface-defs"],
+ "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.cpp.inc",
+ ),
+ ],
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td",
+ deps = [
+ ":RelayoutOpInterfaceTdFiles",
+ ],
+)
+
+cc_library(
+ name = "RelayoutOpInterface",
+ hdrs = ["include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h"],
+ deps = [":RelayoutOpInterfaceIncGen"],
+)
+
cc_library(
name = "IR",
srcs = glob([
@@ -3910,6 +3944,7 @@ cc_library(
":MaskableOpInterface",
":ShapedOpInterfaces",
":SideEffectInterfaces",
+ ":Support",
":VectorDialect",
":ViewLikeInterface",
":XeGPUEnumsIncGen",
@@ -7857,9 +7892,9 @@ cc_library(
":InferIntRangeInterface",
":InferTypeOpInterface",
":InliningUtils",
- ":LinalgInterfaces",
":LoopLikeInterface",
":ParallelCombiningOpInterface",
+ ":RelayoutOpInterface",
":ShapedOpInterfaces",
":SideEffectInterfaces",
":SubsetOpInterface",
@@ -11140,6 +11175,7 @@ td_library(
"include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td",
"include/mlir/Dialect/Linalg/IR/LinalgOps.td",
"include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td",
+ "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td",
],
includes = ["include"],
deps = [
@@ -11152,6 +11188,7 @@ td_library(
":SideEffectInterfacesTdFiles",
":TilingInterfaceTdFiles",
":ViewLikeInterfaceTdFiles",
+ ":RelayoutOpInterfaceIncGen",
],
)
@@ -11563,6 +11600,7 @@ cc_library(
":InferTypeOpInterface",
":LinalgInterfacesIncGen",
":LinalgStructuredOpsIncGen",
+ ":RelayoutOpInterfaceIncGen",
":Support",
":ViewLikeInterface",
],
@@ -11603,6 +11641,7 @@ cc_library(
":LinalgNamedStructuredOpsYamlIncGen",
":LinalgOpsIncGen",
":LinalgRelayoutOpsIncGen",
+ ":RelayoutOpInterfaceIncGen",
":LinalgStructuredOpsIncGen",
":MathDialect",
":MemRefDialect",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Target/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Target/BUILD.bazel
index 5dd22ea..7fab1ea 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/Target/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/Target/BUILD.bazel
@@ -9,6 +9,7 @@ package(default_visibility = ["//visibility:public"])
name = "%s.test" % src,
srcs = [src],
data = [
+ "//llvm:split-file",
"//mlir:mlir-opt",
"//mlir:mlir-translate",
"//mlir/test:lit_data",