aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/benchmarks/SpecialCaseListBM.cpp29
-rw-r--r--llvm/include/llvm/ADT/Bitfields.h9
-rw-r--r--llvm/include/llvm/ADT/DenseMap.h2
-rw-r--r--llvm/include/llvm/ADT/DepthFirstIterator.h18
-rw-r--r--llvm/include/llvm/ADT/ImmutableSet.h6
-rw-r--r--llvm/include/llvm/ADT/PostOrderIterator.h6
-rw-r--r--llvm/include/llvm/ADT/STLExtras.h2
-rw-r--r--llvm/include/llvm/ADT/SmallPtrSet.h12
-rw-r--r--llvm/include/llvm/ADT/bit.h15
-rw-r--r--llvm/include/llvm/Support/Alignment.h2
-rw-r--r--llvm/include/llvm/Support/Casting.h7
-rw-r--r--llvm/include/llvm/Support/CommandLine.h2
-rw-r--r--llvm/include/llvm/Support/DOTGraphTraits.h3
-rw-r--r--llvm/include/llvm/Support/ELFAttributes.h2
-rw-r--r--llvm/include/llvm/Support/LSP/Protocol.h2
-rw-r--r--llvm/include/llvm/Support/MD5.h2
-rw-r--r--llvm/include/llvm/Support/Timer.h2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h9
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h24
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp96
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/abs_i32.ll92
-rw-r--r--llvm/test/CodeGen/AMDGPU/addsub64_carry.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1260
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll135
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll210
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll585
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll614
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.mir82
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_cmp_0.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll654
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll207
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll190
-rw-r--r--llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll8
-rw-r--r--llvm/test/Transforms/SCCP/constant-range-struct.ll65
-rw-r--r--llvm/unittests/ADT/BitTest.cpp16
64 files changed, 3465 insertions, 2511 deletions
diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp
index 00aa3cd..b5d8268 100644
--- a/llvm/benchmarks/SpecialCaseListBM.cpp
+++ b/llvm/benchmarks/SpecialCaseListBM.cpp
@@ -110,6 +110,26 @@ std::string genGlobAtBothSides(const std::vector<std::string> &Files) {
return S;
}
+std::string genGlobAtBothSidesAndMid(const std::vector<std::string> &Files) {
+ std::string S;
+ std::minstd_rand Rng(RNG_SEED);
+ for (std::string F : Files) {
+ std::uniform_int_distribution<> PosDistrib(0, F.size() - 1);
+ F[PosDistrib(Rng)] = '*';
+
+ std::uniform_int_distribution<> Ends(0, 1);
+ if (Ends(Rng)) {
+ F.back() = '*';
+ F.front() = '*';
+ }
+
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
void BM_Make_(
benchmark::State &state,
std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
@@ -171,6 +191,9 @@ BENCHMARK_CAPTURE(BM_Make_, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_Make_, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_CAPTURE(BM_True_, None_, genGlobNone)
->RangeMultiplier(MAX_LIST_MUL)
@@ -187,6 +210,9 @@ BENCHMARK_CAPTURE(BM_True_, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_True_, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_CAPTURE(BM_False, None_, genGlobNone)
->RangeMultiplier(MAX_LIST_MUL)
@@ -203,5 +229,8 @@ BENCHMARK_CAPTURE(BM_False, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_False, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_MAIN();
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 1af2761..1fbc41c 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -154,12 +154,9 @@ struct ResolveUnderlyingType {
using type = std::underlying_type_t<T>;
};
template <typename T> struct ResolveUnderlyingType<T, false> {
- using type = T;
-};
-template <> struct ResolveUnderlyingType<bool, false> {
- /// In case sizeof(bool) != 1, replace `void` by an additionnal
- /// std::conditional.
- using type = std::conditional_t<sizeof(bool) == 1, uint8_t, void>;
+ static_assert(!std::is_same_v<T, bool> || sizeof(bool) == 1,
+ "T being bool requires sizeof(bool) == 1.");
+ using type = std::conditional_t<std::is_same_v<T, bool>, uint8_t, T>;
};
} // namespace bitfields_details
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 4bda50f..25b5262 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -42,7 +42,7 @@ namespace detail {
// We extend a pair to allow users to override the bucket type with their own
// implementation without requiring two members.
template <typename KeyT, typename ValueT>
-struct DenseMapPair : public std::pair<KeyT, ValueT> {
+struct DenseMapPair : std::pair<KeyT, ValueT> {
using std::pair<KeyT, ValueT>::pair;
KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
diff --git a/llvm/include/llvm/ADT/DepthFirstIterator.h b/llvm/include/llvm/ADT/DepthFirstIterator.h
index 4ced758..3c54f32 100644
--- a/llvm/include/llvm/ADT/DepthFirstIterator.h
+++ b/llvm/include/llvm/ADT/DepthFirstIterator.h
@@ -66,8 +66,8 @@ public:
// one more method, completed, which is invoked when all children of a
// node have been processed. It is intended to distinguish of back and
// cross edges in the spanning tree but is not used in the common case.
-template <typename NodeRef, unsigned SmallSize=8>
-struct df_iterator_default_set : public SmallPtrSet<NodeRef, SmallSize> {
+template <typename NodeRef, unsigned SmallSize = 8>
+struct df_iterator_default_set : SmallPtrSet<NodeRef, SmallSize> {
using BaseSet = SmallPtrSet<NodeRef, SmallSize>;
using iterator = typename BaseSet::iterator;
@@ -235,8 +235,10 @@ iterator_range<df_iterator<T>> depth_first(const T& G) {
}
// Provide global definitions of external depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct df_ext_iterator : public df_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct df_ext_iterator : df_iterator<T, SetTy, true> {
df_ext_iterator(const df_iterator<T, SetTy, true> &V)
: df_iterator<T, SetTy, true>(V) {}
};
@@ -262,7 +264,7 @@ template <class T,
class SetTy =
df_iterator_default_set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct idf_iterator : public df_iterator<Inverse<T>, SetTy, External> {
+struct idf_iterator : df_iterator<Inverse<T>, SetTy, External> {
idf_iterator(const df_iterator<Inverse<T>, SetTy, External> &V)
: df_iterator<Inverse<T>, SetTy, External>(V) {}
};
@@ -284,8 +286,10 @@ iterator_range<idf_iterator<T>> inverse_depth_first(const T& G) {
}
// Provide global definitions of external inverse depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct idf_ext_iterator : public idf_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct idf_ext_iterator : idf_iterator<T, SetTy, true> {
idf_ext_iterator(const idf_iterator<T, SetTy, true> &V)
: idf_iterator<T, SetTy, true>(V) {}
idf_ext_iterator(const df_iterator<Inverse<T>, SetTy, true> &V)
diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h
index 310539f..8b2425e 100644
--- a/llvm/include/llvm/ADT/ImmutableSet.h
+++ b/llvm/include/llvm/ADT/ImmutableSet.h
@@ -931,8 +931,7 @@ struct ImutProfileInfo<T*> {
/// ImutContainerInfo - Generic definition of comparison operations for
/// elements of immutable containers that defaults to using
/// std::equal_to<> and std::less<> to perform comparison of elements.
-template <typename T>
-struct ImutContainerInfo : public ImutProfileInfo<T> {
+template <typename T> struct ImutContainerInfo : ImutProfileInfo<T> {
using value_type = typename ImutProfileInfo<T>::value_type;
using value_type_ref = typename ImutProfileInfo<T>::value_type_ref;
using key_type = value_type;
@@ -957,8 +956,7 @@ struct ImutContainerInfo : public ImutProfileInfo<T> {
/// ImutContainerInfo - Specialization for pointer values to treat pointers
/// as references to unique objects. Pointers are thus compared by
/// their addresses.
-template <typename T>
-struct ImutContainerInfo<T*> : public ImutProfileInfo<T*> {
+template <typename T> struct ImutContainerInfo<T *> : ImutProfileInfo<T *> {
using value_type = typename ImutProfileInfo<T*>::value_type;
using value_type_ref = typename ImutProfileInfo<T*>::value_type_ref;
using key_type = value_type;
diff --git a/llvm/include/llvm/ADT/PostOrderIterator.h b/llvm/include/llvm/ADT/PostOrderIterator.h
index 1cbd3c1..d9aa452 100644
--- a/llvm/include/llvm/ADT/PostOrderIterator.h
+++ b/llvm/include/llvm/ADT/PostOrderIterator.h
@@ -200,7 +200,7 @@ template <class T> iterator_range<po_iterator<T>> post_order(const T &G) {
// Provide global definitions of external postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct po_ext_iterator : public po_iterator<T, SetType, true> {
+struct po_ext_iterator : po_iterator<T, SetType, true> {
po_ext_iterator(const po_iterator<T, SetType, true> &V) :
po_iterator<T, SetType, true>(V) {}
};
@@ -223,7 +223,7 @@ iterator_range<po_ext_iterator<T, SetType>> post_order_ext(const T &G, SetType &
// Provide global definitions of inverse post order iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct ipo_iterator : public po_iterator<Inverse<T>, SetType, External> {
+struct ipo_iterator : po_iterator<Inverse<T>, SetType, External> {
ipo_iterator(const po_iterator<Inverse<T>, SetType, External> &V) :
po_iterator<Inverse<T>, SetType, External> (V) {}
};
@@ -245,7 +245,7 @@ iterator_range<ipo_iterator<T>> inverse_post_order(const T &G) {
// Provide global definitions of external inverse postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct ipo_ext_iterator : public ipo_iterator<T, SetType, true> {
+struct ipo_ext_iterator : ipo_iterator<T, SetType, true> {
ipo_ext_iterator(const ipo_iterator<T, SetType, true> &V) :
ipo_iterator<T, SetType, true>(V) {}
ipo_ext_iterator(const po_iterator<Inverse<T>, SetType, true> &V) :
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 658f262..a9841c6 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -674,7 +674,7 @@ using zip_traits = iterator_facade_base<
ReferenceTupleType *, ReferenceTupleType>;
template <typename ZipType, typename ReferenceTupleType, typename... Iters>
-struct zip_common : public zip_traits<ZipType, ReferenceTupleType, Iters...> {
+struct zip_common : zip_traits<ZipType, ReferenceTupleType, Iters...> {
using Base = zip_traits<ZipType, ReferenceTupleType, Iters...>;
using IndexSequence = std::index_sequence_for<Iters...>;
using value_type = typename Base::value_type;
diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h
index f588a77..8e7c8b3 100644
--- a/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -532,18 +532,8 @@ class SmallPtrSet : public SmallPtrSetImpl<PtrType> {
using BaseT = SmallPtrSetImpl<PtrType>;
- // A constexpr version of llvm::bit_ceil.
- // TODO: Replace this with std::bit_ceil once C++20 is available.
- static constexpr size_t RoundUpToPowerOfTwo(size_t X) {
- size_t C = 1;
- size_t CMax = C << (std::numeric_limits<size_t>::digits - 1);
- while (C < X && C < CMax)
- C <<= 1;
- return C;
- }
-
// Make sure that SmallSize is a power of two, round up if not.
- static constexpr size_t SmallSizePowTwo = RoundUpToPowerOfTwo(SmallSize);
+ static constexpr size_t SmallSizePowTwo = llvm::bit_ceil_constexpr(SmallSize);
/// SmallStorage - Fixed size storage used in 'small mode'.
const void *SmallStorage[SmallSizePowTwo];
diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index 8c68d0a..8b60b69 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -336,6 +336,21 @@ template <typename T> [[nodiscard]] T bit_ceil(T Value) {
return T(1) << llvm::bit_width<T>(Value - 1u);
}
+/// Returns the smallest integral power of two no smaller than Value if Value is
+/// nonzero. Returns 1 otherwise.
+///
+/// Ex. bit_ceil(5) == 8.
+///
+/// The return value is undefined if the input is larger than the largest power
+/// of two representable in T.
+template <typename T> [[nodiscard]] constexpr T bit_ceil_constexpr(T Value) {
+ static_assert(std::is_unsigned_v<T>,
+ "Only unsigned integral types are allowed.");
+ if (Value < 2)
+ return 1;
+ return T(1) << llvm::bit_width_constexpr<T>(Value - 1u);
+}
+
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr T rotl(T V, int R) {
constexpr unsigned N = std::numeric_limits<T>::digits;
diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
index a4ca54e..f9d7c76 100644
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -103,7 +103,7 @@ inline Align assumeAligned(uint64_t Value) {
/// This struct is a compact representation of a valid (power of two) or
/// undefined (0) alignment.
-struct MaybeAlign : public std::optional<Align> {
+struct MaybeAlign : std::optional<Align> {
private:
using UP = std::optional<Align>;
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 2a9a149..6f6df2e 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -340,7 +340,7 @@ struct ValueFromPointerCast
/// during the cast. It's also a good example of how to implement a move-only
/// cast.
template <typename To, typename From, typename Derived = void>
-struct UniquePtrCast : public CastIsPossible<To, From *> {
+struct UniquePtrCast : CastIsPossible<To, From *> {
using Self = detail::SelfType<Derived, UniquePtrCast<To, From>>;
using CastResultType = std::unique_ptr<
std::remove_reference_t<typename cast_retty<To, From>::ret_type>>;
@@ -473,7 +473,7 @@ struct ForwardToPointerCast {
// take advantage of the cast traits whenever possible!
template <typename To, typename From, typename Enable = void>
-struct CastInfo : public CastIsPossible<To, From> {
+struct CastInfo : CastIsPossible<To, From> {
using Self = CastInfo<To, From, Enable>;
using CastReturnType = typename cast_retty<To, From>::ret_type;
@@ -536,8 +536,7 @@ struct CastInfo<To, std::unique_ptr<From>> : public UniquePtrCast<To, From> {};
/// the input is std::optional<From> that the output can be std::optional<To>.
/// If that's not the case, specialize CastInfo for your use case.
template <typename To, typename From>
-struct CastInfo<To, std::optional<From>> : public OptionalValueCast<To, From> {
-};
+struct CastInfo<To, std::optional<From>> : OptionalValueCast<To, From> {};
/// isa<X> - Return true if the parameter to the template is an instance of one
/// of the template type arguments. Used like this:
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index dd05c53..5a5f00e 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -549,7 +549,7 @@ template <class DataType> struct OptionValue;
// The default value safely does nothing. Option value printing is only
// best-effort.
template <class DataType, bool isClass>
-struct OptionValueBase : public GenericOptionValue {
+struct OptionValueBase : GenericOptionValue {
// Temporary storage for argument passing.
using WrapperType = OptionValue<DataType>;
diff --git a/llvm/include/llvm/Support/DOTGraphTraits.h b/llvm/include/llvm/Support/DOTGraphTraits.h
index bf30aa4..3b9fe00 100644
--- a/llvm/include/llvm/Support/DOTGraphTraits.h
+++ b/llvm/include/llvm/Support/DOTGraphTraits.h
@@ -162,8 +162,7 @@ public:
/// graphs are converted to 'dot' graphs. When specializing, you may inherit
/// from DefaultDOTGraphTraits if you don't need to override everything.
///
-template <typename Ty>
-struct DOTGraphTraits : public DefaultDOTGraphTraits {
+template <typename Ty> struct DOTGraphTraits : DefaultDOTGraphTraits {
using DefaultDOTGraphTraits::DefaultDOTGraphTraits;
};
diff --git a/llvm/include/llvm/Support/ELFAttributes.h b/llvm/include/llvm/Support/ELFAttributes.h
index 270246f..5771a84 100644
--- a/llvm/include/llvm/Support/ELFAttributes.h
+++ b/llvm/include/llvm/Support/ELFAttributes.h
@@ -48,8 +48,6 @@ struct SubsectionAndTagToTagName {
StringRef SubsectionName;
unsigned Tag;
StringRef TagName;
- SubsectionAndTagToTagName(StringRef SN, unsigned Tg, StringRef TN)
- : SubsectionName(SN), Tag(Tg), TagName(TN) {}
};
namespace ELFAttrs {
diff --git a/llvm/include/llvm/Support/LSP/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h
index 93b82f1..e38203a 100644
--- a/llvm/include/llvm/Support/LSP/Protocol.h
+++ b/llvm/include/llvm/Support/LSP/Protocol.h
@@ -449,7 +449,7 @@ struct ReferenceContext {
bool fromJSON(const llvm::json::Value &value, ReferenceContext &result,
llvm::json::Path path);
-struct ReferenceParams : public TextDocumentPositionParams {
+struct ReferenceParams : TextDocumentPositionParams {
ReferenceContext context;
};
diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h
index ed29826..4ba3867 100644
--- a/llvm/include/llvm/Support/MD5.h
+++ b/llvm/include/llvm/Support/MD5.h
@@ -41,7 +41,7 @@ template <typename T> class ArrayRef;
class MD5 {
public:
- struct MD5Result : public std::array<uint8_t, 16> {
+ struct MD5Result : std::array<uint8_t, 16> {
LLVM_ABI SmallString<32> digest() const;
uint64_t low() const {
diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h
index 40709d4..a4ed712 100644
--- a/llvm/include/llvm/Support/Timer.h
+++ b/llvm/include/llvm/Support/Timer.h
@@ -167,7 +167,7 @@ public:
/// you to declare a new timer, AND specify the region to time, all in one
/// statement. All timers with the same name are merged. This is primarily
/// used for debugging and for hunting performance problems.
-struct NamedRegionTimer : public TimeRegion {
+struct NamedRegionTimer : TimeRegion {
LLVM_ABI explicit NamedRegionTimer(StringRef Name, StringRef Description,
StringRef GroupName,
StringRef GroupDescription,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 979a8b0..4b22c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
+#include <array>
namespace llvm {
@@ -45,7 +46,7 @@ struct GCNRegPressure {
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
}
- void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ void clear() { Value.fill(0); }
unsigned getNumRegs(RegKind Kind) const {
assert(Kind < TOTAL_KINDS);
@@ -127,9 +128,7 @@ struct GCNRegPressure {
bool less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
- bool operator==(const GCNRegPressure &O) const {
- return std::equal(&Value[0], &Value[ValueArraySize], O.Value);
- }
+ bool operator==(const GCNRegPressure &O) const { return Value == O.Value; }
bool operator!=(const GCNRegPressure &O) const {
return !(*this == O);
@@ -160,7 +159,7 @@ private:
/// Pressure for all register kinds (first all regular registers kinds, then
/// all tuple register kinds).
- unsigned Value[ValueArraySize];
+ std::array<unsigned, ValueArraySize> Value;
static unsigned getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI);
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..09ef6ac 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
+ // 32-bit ABS is legal for AMDGPU except for R600
+ setOperationAction(ISD::ABS, MVT::i32, Expand);
+
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 942e784..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
- const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
- this]() -> bool {
- if (CmpValue != 0)
- return false;
-
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
- return false;
-
- bool CanOptimize = false;
-
- // For S_OP that set SCC = DST!=0, do the transformation
- //
- // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
- if (setsSCCifResultIsNonZero(*Def))
- CanOptimize = true;
-
- // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
- // the same value that will be calculated by s_cmp_lg_*
- //
- // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
- // imm), 0)
- if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
- bool Op1IsNonZeroImm =
- Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
- bool Op2IsZeroImm =
- Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
- if (Op1IsNonZeroImm && Op2IsZeroImm)
- CanOptimize = true;
- }
-
- if (!CanOptimize)
- return false;
-
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
-
- if (MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
- return true;
- };
-
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
+ I->killsRegister(AMDGPU::SCC, &RI))
return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10810,7 +10753,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 32, true, false);
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10818,7 +10761,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 64, true, false);
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ee99a74..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -709,30 +709,6 @@ public:
}
}
- static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
- if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- return false;
- // Compares have no result
- if (MI.isCompare())
- return false;
- switch (MI.getOpcode()) {
- default:
- return true;
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_U32:
- case AMDGPU::S_ADDC_U32:
- case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUBB_U32:
- case AMDGPU::S_MIN_I32:
- case AMDGPU::S_MIN_U32:
- case AMDGPU::S_MAX_I32:
- case AMDGPU::S_MAX_U32:
- case AMDGPU::S_ADDK_I32:
- return false;
- }
- }
-
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b80c3c9..4947d03 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -760,6 +761,7 @@ private:
void handleCallArguments(CallBase &CB);
void handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO, unsigned Idx);
+ bool isInstFullyOverDefined(Instruction &Inst);
private:
friend class InstVisitor<SCCPInstVisitor>;
@@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
void SCCPInstVisitor::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
- unsigned NumActiveIncoming = 0;
+ if (isInstFullyOverDefined(PN))
+ return;
+ SmallVector<unsigned> FeasibleIncomingIndices;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+ FeasibleIncomingIndices.push_back(i);
+ }
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is a constant
// range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
+ if (StructType *STy = dyn_cast<StructType>(PN.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement PhiState = getStructValueState(&PN, i);
+ if (PhiState.isOverdefined())
+ continue;
+ for (unsigned j : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV =
+ getStructValueState(PN.getIncomingValue(j), i);
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i);
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
+ }
+ } else {
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
}
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- ValueLatticeElement &PhiStateRef = ValueState[&PN];
- mergeInValue(PhiStateRef, &PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
void SCCPInstVisitor::visitReturnInst(ReturnInst &I) {
@@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
}
}
+bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) {
+ // For structure Type, we handle each member separately.
+ // A structure object won't be considered as overdefined when
+ // there is at least one member that is not overdefined.
+ if (StructType *STy = dyn_cast<StructType>(Inst.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) {
+ if (!getStructValueState(&Inst, i).isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+ return getValueState(&Inst).isOverdefined();
+}
+
void SCCPInstVisitor::solve() {
// Process the work lists until they are empty!
while (!BBWorkList.empty() || !InstWorkList.empty()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..84d2ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -4075,6 +4079,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions take
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..7e074c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..d1e67e6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..f5f616f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..9a2497e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 7714c03..5171403 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,6 +140,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -344,6 +345,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b81669..7b01f13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,6 +143,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -347,6 +348,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i32.ll b/llvm/test/CodeGen/AMDGPU/abs_i32.ll
new file mode 100644
index 0000000..b53047f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/abs_i32.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600 %s
+
+define amdgpu_kernel void @abs_v1(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v1:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: SUB_INT * T1.W, 0.0, PV.W,
+; R600-NEXT: MAX_INT T0.X, T0.W, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @abs_v2(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v2:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z,
+; R600-NEXT: MAX_INT T0.X, KC0[2].Z, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %neg = sub i32 0, %arg
+ %cond = icmp sgt i32 %arg, %neg
+ %res = select i1 %cond, i32 %arg, i32 %neg
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @abs_v3(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v3:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z,
+; R600-NEXT: MAX_INT T0.X, PV.W, KC0[2].Z,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %neg = sub i32 0, %arg
+ %cond = icmp sgt i32 %neg, %arg
+ %res = select i1 %cond, i32 %neg, i32 %arg
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index 8088c1b..b72eba8 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,7 +180,11 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -195,10 +199,14 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s6, s2, s6
-; CHECK-NEXT: s_addc_u32 s7, s3, s7
+; CHECK-NEXT: s_add_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_addc_u32 s8, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -207,8 +215,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -225,10 +233,14 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s6, s2, s6
-; CHECK-NEXT: s_subb_u32 s7, s3, s7
+; CHECK-NEXT: s_sub_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_subb_u32 s8, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -237,8 +249,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -256,6 +268,8 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -278,6 +292,8 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -323,6 +339,8 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -345,6 +363,8 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c3..948811e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,9 +7821,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7854,6 +7855,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7879,50 +7881,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s16, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s4
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s17
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s16
-; GFX6-NEXT: s_add_i32 s18, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s18
-; GFX6-NEXT: s_mul_i32 s4, s10, s16
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s4, s5
-; GFX6-NEXT: s_subb_u32 s19, s14, s11
-; GFX6-NEXT: s_sub_u32 s20, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s20, s10
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s20, s16, 2
-; GFX6-NEXT: s_addc_u32 s21, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s20, s15
-; GFX6-NEXT: s_cselect_b32 s15, s21, s19
+; GFX6-NEXT: s_or_b32 s18, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_u32 s19, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s18
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s6, s10
-; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s10
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cselect_b32 s4, s17, s5
+; GFX6-NEXT: s_add_u32 s5, s14, 1
+; GFX6-NEXT: s_addc_u32 s17, s15, 0
+; GFX6-NEXT: s_add_u32 s19, s14, 2
+; GFX6-NEXT: s_addc_u32 s20, s15, 0
; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s5, s15, s17
-; GFX6-NEXT: s_cselect_b32 s4, s14, s16
+; GFX6-NEXT: s_cselect_b32 s4, s19, s5
+; GFX6-NEXT: s_cselect_b32 s5, s20, s17
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s7, s7, s16
+; GFX6-NEXT: s_cmp_ge_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s16, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s6, s6, s16
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b32 s5, s5, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s14
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7945,8 +7949,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s8
-; GFX9-NEXT: s_subb_u32 s5, 0, s9
+; GFX9-NEXT: s_sub_u32 s10, 0, s8
+; GFX9-NEXT: s_subb_u32 s11, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7956,52 +7960,56 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s11, v1
-; GFX9-NEXT: s_mul_i32 s12, s4, s10
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
-; GFX9-NEXT: s_mul_i32 s13, s5, s11
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s4, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
+; GFX9-NEXT: v_readfirstlane_b32 s12, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s13, s11, s4
+; GFX9-NEXT: s_add_i32 s5, s14, s5
+; GFX9-NEXT: s_mul_i32 s15, s10, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
+; GFX9-NEXT: s_mul_i32 s16, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
-; GFX9-NEXT: s_mul_i32 s15, s10, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s12, s5
+; GFX9-NEXT: s_add_u32 s5, s13, s5
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s12
-; GFX9-NEXT: s_addc_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_i32 s12, s4, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s5, s5, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
-; GFX9-NEXT: s_mul_i32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
-; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
-; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_add_u32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s4, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s10, s10, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
-; GFX9-NEXT: s_addc_u32 s4, s15, s13
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s4, s4, s12
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s11, s4
-; GFX9-NEXT: s_addc_u32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s10, s4
+; GFX9-NEXT: s_addc_u32 s10, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s12, s10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8020,35 +8028,38 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s13, s11, s10
-; GFX9-NEXT: s_addc_u32 s12, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
+; GFX9-NEXT: s_add_u32 s14, s11, s10
+; GFX9-NEXT: s_addc_u32 s15, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s13
-; GFX9-NEXT: s_add_i32 s14, s10, s11
-; GFX9-NEXT: s_sub_i32 s15, s3, s14
-; GFX9-NEXT: s_mul_i32 s10, s8, s13
+; GFX9-NEXT: s_mul_i32 s11, s9, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s12, s3, s16
+; GFX9-NEXT: s_mul_i32 s10, s8, s14
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s9
-; GFX9-NEXT: s_sub_u32 s16, s2, s8
-; GFX9-NEXT: s_subb_u32 s15, s15, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s9
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s17, s12, s9
+; GFX9-NEXT: s_sub_u32 s18, s2, s8
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_subb_u32 s12, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s12, s9
+; GFX9-NEXT: s_cselect_b32 s13, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s18, s8
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s8
-; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s15, s9
-; GFX9-NEXT: s_cselect_b32 s15, s16, s17
-; GFX9-NEXT: s_add_u32 s16, s13, 1
-; GFX9-NEXT: s_addc_u32 s17, s12, 0
-; GFX9-NEXT: s_add_u32 s18, s13, 2
-; GFX9-NEXT: s_addc_u32 s19, s12, 0
-; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b32 s15, s18, s16
-; GFX9-NEXT: s_cselect_b32 s16, s19, s17
+; GFX9-NEXT: s_cmp_eq_u32 s12, s9
+; GFX9-NEXT: s_cselect_b32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s13, s14, 1
+; GFX9-NEXT: s_addc_u32 s17, s15, 0
+; GFX9-NEXT: s_add_u32 s18, s14, 2
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b32 s12, s18, s13
+; GFX9-NEXT: s_cselect_b32 s13, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s16
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8056,8 +8067,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s16, s12
-; GFX9-NEXT: s_cselect_b32 s2, s15, s13
+; GFX9-NEXT: s_cselect_b32 s3, s13, s15
+; GFX9-NEXT: s_cselect_b32 s2, s12, s14
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8317,9 +8328,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8350,6 +8362,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8374,53 +8387,55 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s18, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s17, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s19
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s18
-; GFX6-NEXT: s_add_i32 s20, s14, s15
-; GFX6-NEXT: s_sub_i32 s16, s9, s20
-; GFX6-NEXT: s_mul_i32 s14, s6, s18
+; GFX6-NEXT: s_mul_i32 s15, s7, s17
+; GFX6-NEXT: s_add_i32 s18, s14, s15
+; GFX6-NEXT: s_sub_i32 s19, s9, s18
+; GFX6-NEXT: s_mul_i32 s14, s6, s17
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s14, s15
-; GFX6-NEXT: s_subb_u32 s21, s16, s7
-; GFX6-NEXT: s_sub_u32 s22, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s16, s17
-; GFX6-NEXT: s_subb_u32 s16, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s6
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s16, s21, s17
-; GFX6-NEXT: s_add_u32 s17, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b32 s16, s22, s17
-; GFX6-NEXT: s_cselect_b32 s17, s23, s21
+; GFX6-NEXT: s_or_b32 s20, s14, s15
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s19, s19, s7
+; GFX6-NEXT: s_sub_u32 s21, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s9, s9, s20
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s7
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s21, s6
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s7
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s17, 1
+; GFX6-NEXT: s_addc_u32 s19, s16, 0
+; GFX6-NEXT: s_add_u32 s21, s17, 2
+; GFX6-NEXT: s_addc_u32 s22, s16, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_cselect_b32 s15, s22, s19
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s18
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s18
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s17, s19
-; GFX6-NEXT: s_cselect_b32 s6, s16, s18
+; GFX6-NEXT: s_cselect_b32 s7, s15, s16
+; GFX6-NEXT: s_cselect_b32 s6, s14, s17
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s16, s6, s2
-; GFX6-NEXT: s_subb_u32 s17, s7, s3
+; GFX6-NEXT: s_sub_u32 s14, s6, s2
+; GFX6-NEXT: s_subb_u32 s15, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8439,39 +8454,40 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
+; GFX6-NEXT: s_mul_i32 s1, s12, s16
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s12, s2
+; GFX6-NEXT: s_mul_i32 s17, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s17, s16, s17
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s17
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s16, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s4, s16, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8485,14 +8501,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s12
+; GFX6-NEXT: s_addc_u32 s3, s16, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8501,6 +8517,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8512,70 +8529,72 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s15, s2
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_add_u32 s2, s17, s2
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v1
+; GFX6-NEXT: v_readfirstlane_b32 s17, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s14, s15
+; GFX6-NEXT: s_addc_u32 s2, s16, s17
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s18, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s16, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s19
+; GFX6-NEXT: s_addc_u32 s17, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s17
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s18
-; GFX6-NEXT: s_add_i32 s20, s12, s13
-; GFX6-NEXT: s_sub_i32 s14, s11, s20
-; GFX6-NEXT: s_mul_i32 s12, s8, s18
+; GFX6-NEXT: s_mul_i32 s13, s9, s16
+; GFX6-NEXT: s_add_i32 s18, s12, s13
+; GFX6-NEXT: s_sub_i32 s19, s11, s18
+; GFX6-NEXT: s_mul_i32 s12, s8, s16
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s21, s14, s9
-; GFX6-NEXT: s_sub_u32 s22, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s8
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_add_u32 s15, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s22, s15
-; GFX6-NEXT: s_cselect_b32 s15, s23, s21
+; GFX6-NEXT: s_or_b32 s20, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s19, s19, s9
+; GFX6-NEXT: s_sub_u32 s21, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s11, s11, s20
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s12, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s12, s9
+; GFX6-NEXT: s_cselect_b32 s13, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s21, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s12, s9
+; GFX6-NEXT: s_cselect_b32 s12, s19, s13
+; GFX6-NEXT: s_add_u32 s13, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s21, s16, 2
+; GFX6-NEXT: s_addc_u32 s22, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b32 s12, s21, s13
+; GFX6-NEXT: s_cselect_b32 s13, s22, s19
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s11, s11, s18
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s12, -1, 0
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s12
+; GFX6-NEXT: s_cselect_b32 s8, s8, s18
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s15, s19
-; GFX6-NEXT: s_cselect_b32 s8, s14, s18
+; GFX6-NEXT: s_cselect_b32 s9, s13, s17
+; GFX6-NEXT: s_cselect_b32 s8, s12, s16
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8595,8 +8614,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s12, 0, s6
-; GFX9-NEXT: s_subb_u32 s13, 0, s7
+; GFX9-NEXT: s_sub_u32 s14, 0, s6
+; GFX9-NEXT: s_subb_u32 s15, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8605,52 +8624,56 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s15, v0
-; GFX9-NEXT: s_mul_i32 s16, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
-; GFX9-NEXT: s_mul_i32 s17, s13, s15
-; GFX9-NEXT: s_add_i32 s16, s18, s16
-; GFX9-NEXT: s_mul_i32 s19, s12, s15
-; GFX9-NEXT: s_add_i32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
-; GFX9-NEXT: s_mul_i32 s20, s15, s16
-; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
+; GFX9-NEXT: v_readfirstlane_b32 s16, v1
+; GFX9-NEXT: v_readfirstlane_b32 s12, v0
+; GFX9-NEXT: s_mul_i32 s13, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s17, s15, s12
+; GFX9-NEXT: s_add_i32 s13, s18, s13
+; GFX9-NEXT: s_mul_i32 s19, s14, s12
+; GFX9-NEXT: s_add_i32 s13, s13, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
+; GFX9-NEXT: s_mul_i32 s20, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
-; GFX9-NEXT: s_mul_i32 s19, s14, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
+; GFX9-NEXT: s_mul_i32 s19, s16, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s16, s14, s16
-; GFX9-NEXT: s_add_u32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s16, s13
+; GFX9-NEXT: s_add_u32 s13, s17, s13
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s15, s15, s16
-; GFX9-NEXT: s_addc_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_i32 s16, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_add_i32 s16, s17, s16
-; GFX9-NEXT: s_mul_i32 s13, s13, s15
-; GFX9-NEXT: s_add_i32 s16, s16, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s20, s15, s16
-; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
-; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
-; GFX9-NEXT: s_add_u32 s12, s12, s20
+; GFX9-NEXT: s_add_u32 s18, s12, s13
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_addc_u32 s16, s16, s17
+; GFX9-NEXT: s_mul_i32 s12, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s15, s15, s18
+; GFX9-NEXT: s_add_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s14, s18
+; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s16, s14
+; GFX9-NEXT: s_mul_i32 s20, s18, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
+; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
+; GFX9-NEXT: s_add_u32 s14, s14, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s12, s12, s18
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
-; GFX9-NEXT: s_addc_u32 s12, s19, s17
+; GFX9-NEXT: s_add_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
+; GFX9-NEXT: s_addc_u32 s14, s19, s15
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s16, s14, s16
-; GFX9-NEXT: s_add_u32 s12, s12, s16
-; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s15, s12
-; GFX9-NEXT: s_addc_u32 s14, s14, s13
+; GFX9-NEXT: s_mul_i32 s12, s16, s12
+; GFX9-NEXT: s_add_u32 s12, s14, s12
+; GFX9-NEXT: s_addc_u32 s14, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s18, s12
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_addc_u32 s14, s16, s14
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8668,35 +8691,38 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s17, s15, s14
-; GFX9-NEXT: s_addc_u32 s16, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s16
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
+; GFX9-NEXT: s_add_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s19, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s19
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s17
-; GFX9-NEXT: s_add_i32 s18, s14, s15
-; GFX9-NEXT: s_sub_i32 s19, s9, s18
-; GFX9-NEXT: s_mul_i32 s14, s6, s17
+; GFX9-NEXT: s_mul_i32 s15, s7, s18
+; GFX9-NEXT: s_add_i32 s20, s14, s15
+; GFX9-NEXT: s_sub_i32 s16, s9, s20
+; GFX9-NEXT: s_mul_i32 s14, s6, s18
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_subb_u32 s19, s19, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_subb_u32 s19, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s19, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GFX9-NEXT: s_subb_u32 s21, s16, s7
+; GFX9-NEXT: s_sub_u32 s22, s8, s6
+; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GFX9-NEXT: s_subb_u32 s16, s21, 0
+; GFX9-NEXT: s_cmp_ge_u32 s16, s7
+; GFX9-NEXT: s_cselect_b32 s17, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s22, s6
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
-; GFX9-NEXT: s_cselect_b32 s20, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s19, s7
-; GFX9-NEXT: s_cselect_b32 s19, s20, s21
-; GFX9-NEXT: s_add_u32 s20, s17, 1
-; GFX9-NEXT: s_addc_u32 s21, s16, 0
-; GFX9-NEXT: s_add_u32 s22, s17, 2
-; GFX9-NEXT: s_addc_u32 s23, s16, 0
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b32 s19, s22, s20
-; GFX9-NEXT: s_cselect_b32 s20, s23, s21
+; GFX9-NEXT: s_cmp_eq_u32 s16, s7
+; GFX9-NEXT: s_cselect_b32 s16, s21, s17
+; GFX9-NEXT: s_add_u32 s17, s18, 1
+; GFX9-NEXT: s_addc_u32 s21, s19, 0
+; GFX9-NEXT: s_add_u32 s22, s18, 2
+; GFX9-NEXT: s_addc_u32 s23, s19, 0
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cselect_b32 s16, s22, s17
+; GFX9-NEXT: s_cselect_b32 s17, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s20
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8704,12 +8730,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s20, s16
-; GFX9-NEXT: s_cselect_b32 s6, s19, s17
+; GFX9-NEXT: s_cselect_b32 s7, s17, s19
+; GFX9-NEXT: s_cselect_b32 s6, s16, s18
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s12, s6, s2
-; GFX9-NEXT: s_subb_u32 s13, s7, s3
+; GFX9-NEXT: s_sub_u32 s14, s6, s2
+; GFX9-NEXT: s_subb_u32 s15, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8718,8 +8744,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s6
-; GFX9-NEXT: s_subb_u32 s5, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8729,98 +8755,105 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: v_readfirstlane_b32 s15, v2
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
-; GFX9-NEXT: s_mul_i32 s16, s4, s15
-; GFX9-NEXT: s_mul_i32 s9, s5, s8
-; GFX9-NEXT: s_add_i32 s14, s14, s16
-; GFX9-NEXT: s_add_i32 s14, s14, s9
-; GFX9-NEXT: s_mul_i32 s17, s4, s8
-; GFX9-NEXT: s_mul_i32 s16, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v2
+; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s16, s8, s13
+; GFX9-NEXT: s_mul_i32 s5, s9, s4
+; GFX9-NEXT: s_add_i32 s12, s12, s16
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s17, s8, s4
+; GFX9-NEXT: s_mul_i32 s16, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
-; GFX9-NEXT: s_mul_i32 s17, s15, s17
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
+; GFX9-NEXT: s_mul_i32 s17, s13, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s9, s9, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
+; GFX9-NEXT: s_addc_u32 s5, s5, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s14, s15, s14
-; GFX9-NEXT: s_add_u32 s9, s9, s14
-; GFX9-NEXT: s_addc_u32 s14, 0, s16
-; GFX9-NEXT: s_add_u32 s8, s8, s9
-; GFX9-NEXT: s_addc_u32 s9, s15, s14
-; GFX9-NEXT: s_mul_i32 s14, s4, s9
-; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
-; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s5, s5, s8
-; GFX9-NEXT: s_add_i32 s14, s14, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
-; GFX9-NEXT: s_mul_i32 s16, s9, s4
-; GFX9-NEXT: s_mul_i32 s18, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
-; GFX9-NEXT: s_add_u32 s4, s4, s18
+; GFX9-NEXT: s_mul_i32 s12, s13, s12
+; GFX9-NEXT: s_add_u32 s5, s5, s12
+; GFX9-NEXT: s_addc_u32 s12, 0, s16
+; GFX9-NEXT: s_add_u32 s16, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s4, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s9, s9, s16
+; GFX9-NEXT: s_add_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s16
+; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
+; GFX9-NEXT: s_mul_i32 s13, s12, s8
+; GFX9-NEXT: s_mul_i32 s18, s16, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
+; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
+; GFX9-NEXT: s_add_u32 s8, s8, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s4, s4, s16
-; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
-; GFX9-NEXT: s_addc_u32 s4, s17, s15
+; GFX9-NEXT: s_add_u32 s8, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s8, s17, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s14, s8, s4
-; GFX9-NEXT: s_addc_u32 s15, s9, s5
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s8, s4
+; GFX9-NEXT: s_addc_u32 s8, 0, s5
+; GFX9-NEXT: s_add_u32 s13, s16, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s8
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
+; GFX9-NEXT: s_mul_i32 s11, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
-; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
+; GFX9-NEXT: s_mul_i32 s13, s9, s13
+; GFX9-NEXT: s_add_u32 s11, s11, s13
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s14, s9, s15
-; GFX9-NEXT: s_add_u32 s14, s10, s14
-; GFX9-NEXT: s_addc_u32 s15, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
+; GFX9-NEXT: s_mul_i32 s12, s9, s12
+; GFX9-NEXT: s_add_u32 s16, s10, s12
+; GFX9-NEXT: s_addc_u32 s17, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s17
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s17, s9, s16
-; GFX9-NEXT: s_mul_i32 s10, s6, s14
+; GFX9-NEXT: s_mul_i32 s11, s7, s16
+; GFX9-NEXT: s_add_i32 s18, s10, s11
+; GFX9-NEXT: s_sub_i32 s12, s9, s18
+; GFX9-NEXT: s_mul_i32 s10, s6, s16
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s7
-; GFX9-NEXT: s_sub_u32 s18, s8, s6
-; GFX9-NEXT: s_subb_u32 s17, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s19, s12, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_subb_u32 s12, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s12, s7
+; GFX9-NEXT: s_cselect_b32 s13, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s6
-; GFX9-NEXT: s_cselect_b32 s18, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s17, s7
-; GFX9-NEXT: s_cselect_b32 s17, s18, s19
-; GFX9-NEXT: s_add_u32 s18, s14, 1
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_add_u32 s20, s14, 2
-; GFX9-NEXT: s_addc_u32 s21, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b32 s17, s20, s18
-; GFX9-NEXT: s_cselect_b32 s18, s21, s19
+; GFX9-NEXT: s_cmp_eq_u32 s12, s7
+; GFX9-NEXT: s_cselect_b32 s12, s19, s13
+; GFX9-NEXT: s_add_u32 s13, s16, 1
+; GFX9-NEXT: s_addc_u32 s19, s17, 0
+; GFX9-NEXT: s_add_u32 s20, s16, 2
+; GFX9-NEXT: s_addc_u32 s21, s17, 0
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b32 s12, s20, s13
+; GFX9-NEXT: s_cselect_b32 s13, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s16
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8828,14 +8861,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s18, s15
-; GFX9-NEXT: s_cselect_b32 s6, s17, s14
+; GFX9-NEXT: s_cselect_b32 s7, s13, s17
+; GFX9-NEXT: s_cselect_b32 s6, s12, s16
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9056,9 +9089,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9089,6 +9123,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9123,43 +9158,46 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s14, s4, s5
-; GFX6-NEXT: s_sub_i32 s13, s7, s14
+; GFX6-NEXT: s_add_i32 s13, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s13
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_subb_u32 s15, s13, s9
-; GFX6-NEXT: s_sub_u32 s16, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s17, s9
-; GFX6-NEXT: s_cselect_b32 s18, s19, s18
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s15, s15, s9
-; GFX6-NEXT: s_sub_u32 s19, s16, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b32 s13, s19, s16
-; GFX6-NEXT: s_cselect_b32 s12, s12, s17
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s14, s14, s9
+; GFX6-NEXT: s_sub_u32 s15, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s14
-; GFX6-NEXT: s_cmp_ge_u32 s4, s9
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s16, s14, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s6, s8
+; GFX6-NEXT: s_cmp_ge_u32 s15, s8
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s9
+; GFX6-NEXT: s_cselect_b32 s17, s17, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s14, s14, s9
+; GFX6-NEXT: s_sub_u32 s18, s15, s8
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s4, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s14, 0
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s14, s18, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s16
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s5, s7, s13
+; GFX6-NEXT: s_cmp_ge_u32 s5, s9
; GFX6-NEXT: s_cselect_b32 s7, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s9
-; GFX6-NEXT: s_cselect_b32 s5, s7, s5
-; GFX6-NEXT: s_cmp_lg_u32 s5, 0
-; GFX6-NEXT: s_cselect_b32 s5, s12, s4
-; GFX6-NEXT: s_cselect_b32 s4, s13, s6
+; GFX6-NEXT: s_cmp_ge_u32 s6, s8
+; GFX6-NEXT: s_cselect_b32 s8, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, s9
+; GFX6-NEXT: s_cselect_b32 s7, s8, s7
+; GFX6-NEXT: s_cmp_lg_u32 s7, 0
+; GFX6-NEXT: s_cselect_b32 s5, s4, s5
+; GFX6-NEXT: s_cselect_b32 s4, s14, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9181,8 +9219,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s6
-; GFX9-NEXT: s_subb_u32 s5, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9192,52 +9230,56 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v1
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
-; GFX9-NEXT: s_mul_i32 s11, s5, s9
-; GFX9-NEXT: s_add_i32 s10, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s4, s9
-; GFX9-NEXT: s_add_i32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
-; GFX9-NEXT: s_mul_i32 s14, s9, s10
-; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s11, s9, s4
+; GFX9-NEXT: s_add_i32 s5, s12, s5
+; GFX9-NEXT: s_mul_i32 s13, s8, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
+; GFX9-NEXT: s_mul_i32 s14, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
-; GFX9-NEXT: s_mul_i32 s13, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
+; GFX9-NEXT: s_mul_i32 s13, s10, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s10, s8, s10
-; GFX9-NEXT: s_add_u32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s10, s5
+; GFX9-NEXT: s_add_u32 s5, s11, s5
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s9, s9, s10
-; GFX9-NEXT: s_addc_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
-; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s10, s10, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX9-NEXT: s_mul_i32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s14, s9, s10
-; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
-; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
-; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_add_u32 s12, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s10, s11
+; GFX9-NEXT: s_mul_i32 s4, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s9, s9, s12
+; GFX9-NEXT: s_add_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
+; GFX9-NEXT: s_mul_i32 s11, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s12, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
+; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
+; GFX9-NEXT: s_add_u32 s8, s8, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s4, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
-; GFX9-NEXT: s_addc_u32 s4, s13, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
+; GFX9-NEXT: s_addc_u32 s8, s13, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s10, s8, s10
-; GFX9-NEXT: s_add_u32 s4, s4, s10
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s9, s4
-; GFX9-NEXT: s_addc_u32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s10, s4
+; GFX9-NEXT: s_add_u32 s4, s8, s4
+; GFX9-NEXT: s_addc_u32 s8, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s12, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s10, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9267,9 +9309,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9278,11 +9322,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, s7
-; GFX9-NEXT: s_sub_u32 s11, s14, s6
-; GFX9-NEXT: s_subb_u32 s10, s10, 0
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s17, s14, s6
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s13, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s11, s14
+; GFX9-NEXT: s_cselect_b32 s11, s17, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9444,9 +9490,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9477,6 +9524,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9509,46 +9557,49 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s16, s13, s14
-; GFX6-NEXT: s_sub_i32 s14, s9, s16
+; GFX6-NEXT: s_add_i32 s14, s13, s14
+; GFX6-NEXT: s_sub_i32 s15, s9, s14
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s14, s3
-; GFX6-NEXT: s_sub_u32 s18, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s14, s15
-; GFX6-NEXT: s_subb_u32 s19, s17, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s3
-; GFX6-NEXT: s_cselect_b32 s20, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s2
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s19, s3
-; GFX6-NEXT: s_cselect_b32 s20, s21, s20
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s17, s17, s3
-; GFX6-NEXT: s_sub_u32 s21, s18, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s15, s21, s18
-; GFX6-NEXT: s_cselect_b32 s14, s14, s19
+; GFX6-NEXT: s_or_b32 s16, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s3
+; GFX6-NEXT: s_sub_u32 s17, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s18, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s3
+; GFX6-NEXT: s_cselect_b32 s13, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s2
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s18, s3
+; GFX6-NEXT: s_cselect_b32 s19, s19, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s3
+; GFX6-NEXT: s_sub_u32 s20, s17, s2
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s9, s9, s16
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s13, s20, s17
+; GFX6-NEXT: s_cselect_b32 s12, s12, s18
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s14
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s12, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s12
+; GFX6-NEXT: s_cselect_b32 s2, s2, s14
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s14, s9
-; GFX6-NEXT: s_cselect_b32 s2, s15, s8
+; GFX6-NEXT: s_cselect_b32 s3, s12, s9
+; GFX6-NEXT: s_cselect_b32 s2, s13, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s14, s2, s6
-; GFX6-NEXT: s_subb_u32 s15, s3, s6
+; GFX6-NEXT: s_sub_u32 s12, s2, s6
+; GFX6-NEXT: s_subb_u32 s13, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9567,39 +9618,40 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s12
+; GFX6-NEXT: s_mul_i32 s1, s8, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s8, s2
+; GFX6-NEXT: s_mul_i32 s15, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s13
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s12, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s12, s4
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9613,98 +9665,102 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s13, v2
-; GFX6-NEXT: s_add_u32 s9, s13, s9
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s9, s15, s9
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s12, s8
+; GFX6-NEXT: s_addc_u32 s3, s14, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s12, s5, s2
+; GFX6-NEXT: s_add_u32 s14, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s13, s4, s8
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s15, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s13
+; GFX6-NEXT: s_mul_i32 s2, s8, s15
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s12
+; GFX6-NEXT: s_addc_u32 s2, s10, s14
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s13
+; GFX6-NEXT: s_mul_i32 s11, s9, s15
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_add_i32 s10, s12, s10
-; GFX6-NEXT: s_mul_i32 s12, s7, s11
-; GFX6-NEXT: s_add_i32 s16, s10, s12
-; GFX6-NEXT: s_sub_i32 s12, s9, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_add_i32 s10, s14, s10
+; GFX6-NEXT: s_mul_i32 s14, s7, s11
+; GFX6-NEXT: s_add_i32 s14, s10, s14
+; GFX6-NEXT: s_sub_i32 s15, s9, s14
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s13, s10, s11
-; GFX6-NEXT: s_subb_u32 s17, s12, s7
-; GFX6-NEXT: s_sub_u32 s18, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s12, s13
-; GFX6-NEXT: s_subb_u32 s19, s17, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s7
-; GFX6-NEXT: s_cselect_b32 s20, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s6
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s19, s7
-; GFX6-NEXT: s_cselect_b32 s20, s21, s20
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s17, s7
-; GFX6-NEXT: s_sub_u32 s21, s18, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s13, s21, s18
-; GFX6-NEXT: s_cselect_b32 s12, s12, s19
+; GFX6-NEXT: s_or_b32 s16, s10, s11
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s7
+; GFX6-NEXT: s_sub_u32 s17, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_subb_u32 s9, s9, s16
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s18, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s7
+; GFX6-NEXT: s_cselect_b32 s11, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s6
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s18, s7
+; GFX6-NEXT: s_cselect_b32 s19, s19, s11
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s7
+; GFX6-NEXT: s_sub_u32 s20, s17, s6
+; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s10, s10, s11
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s10, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s11, s20, s17
+; GFX6-NEXT: s_cselect_b32 s10, s10, s18
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s14
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s10, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s12, s9
-; GFX6-NEXT: s_cselect_b32 s6, s13, s8
+; GFX6-NEXT: s_cselect_b32 s7, s10, s9
+; GFX6-NEXT: s_cselect_b32 s6, s11, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9724,8 +9780,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s12, 0, s2
+; GFX9-NEXT: s_subb_u32 s13, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9734,52 +9790,56 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v0
-; GFX9-NEXT: s_mul_i32 s14, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
-; GFX9-NEXT: s_mul_i32 s15, s7, s13
-; GFX9-NEXT: s_add_i32 s14, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s6, s13
-; GFX9-NEXT: s_add_i32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
-; GFX9-NEXT: s_mul_i32 s18, s13, s14
-; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v0
+; GFX9-NEXT: s_mul_i32 s7, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s15, s13, s6
+; GFX9-NEXT: s_add_i32 s7, s16, s7
+; GFX9-NEXT: s_mul_i32 s17, s12, s6
+; GFX9-NEXT: s_add_i32 s7, s7, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
+; GFX9-NEXT: s_mul_i32 s18, s6, s7
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
-; GFX9-NEXT: s_mul_i32 s17, s12, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
+; GFX9-NEXT: s_mul_i32 s17, s14, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s14, s12, s14
-; GFX9-NEXT: s_add_u32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s14, s7
+; GFX9-NEXT: s_add_u32 s7, s15, s7
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s13, s13, s14
-; GFX9-NEXT: s_addc_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
-; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s7, s7, s13
-; GFX9-NEXT: s_add_i32 s14, s14, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
-; GFX9-NEXT: s_mul_i32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s18, s13, s14
-; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
-; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s18
+; GFX9-NEXT: s_add_u32 s16, s6, s7
+; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-NEXT: s_addc_u32 s14, s14, s15
+; GFX9-NEXT: s_mul_i32 s6, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
+; GFX9-NEXT: s_add_i32 s6, s7, s6
+; GFX9-NEXT: s_mul_i32 s13, s13, s16
+; GFX9-NEXT: s_add_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s16
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s16, s6
+; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
+; GFX9-NEXT: s_add_u32 s12, s12, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s6, s6, s16
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
-; GFX9-NEXT: s_addc_u32 s6, s17, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
+; GFX9-NEXT: s_addc_u32 s12, s17, s13
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s14, s12, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s14
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s13, s6
-; GFX9-NEXT: s_addc_u32 s12, s12, s7
+; GFX9-NEXT: s_mul_i32 s6, s14, s6
+; GFX9-NEXT: s_add_u32 s6, s12, s6
+; GFX9-NEXT: s_addc_u32 s12, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s16, s6
+; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-NEXT: s_addc_u32 s12, s14, s12
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9808,9 +9868,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9819,11 +9881,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, s3
-; GFX9-NEXT: s_sub_u32 s15, s18, s2
-; GFX9-NEXT: s_subb_u32 s14, s14, 0
+; GFX9-NEXT: s_subb_u32 s17, s17, s3
+; GFX9-NEXT: s_sub_u32 s21, s18, s2
+; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GFX9-NEXT: s_subb_u32 s14, s17, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s15, s18
+; GFX9-NEXT: s_cselect_b32 s15, s21, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9847,8 +9911,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s2
-; GFX9-NEXT: s_subb_u32 s5, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9858,70 +9922,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX9-NEXT: s_mul_i32 s14, s4, s9
-; GFX9-NEXT: s_mul_i32 s7, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
+; GFX9-NEXT: s_mul_i32 s14, s6, s9
+; GFX9-NEXT: s_mul_i32 s5, s7, s4
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s15, s4, s6
-; GFX9-NEXT: s_mul_i32 s14, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s15, s6, s4
+; GFX9-NEXT: s_mul_i32 s14, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s7, s7, s17
+; GFX9-NEXT: s_addc_u32 s5, s5, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s7, s7, s8
+; GFX9-NEXT: s_add_u32 s5, s5, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s7
-; GFX9-NEXT: s_addc_u32 s7, s9, s8
-; GFX9-NEXT: s_mul_i32 s8, s4, s7
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
-; GFX9-NEXT: s_mul_i32 s14, s7, s4
-; GFX9-NEXT: s_mul_i32 s16, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
-; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_add_u32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s4, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s7, s7, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s14
+; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX9-NEXT: s_mul_i32 s9, s8, s6
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s6, s6, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
-; GFX9-NEXT: s_addc_u32 s4, s15, s9
+; GFX9-NEXT: s_add_u32 s6, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
+; GFX9-NEXT: s_addc_u32 s6, s15, s7
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s8, s6, s4
-; GFX9-NEXT: s_addc_u32 s9, s7, s5
+; GFX9-NEXT: s_mul_i32 s4, s8, s4
+; GFX9-NEXT: s_add_u32 s4, s6, s4
+; GFX9-NEXT: s_addc_u32 s6, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s8, s6
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
+; GFX9-NEXT: s_mul_i32 s11, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
-; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
-; GFX9-NEXT: s_addc_u32 s8, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_add_u32 s9, s11, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s8, s9, s8
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -9932,9 +10000,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -9943,11 +10013,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, s3
-; GFX9-NEXT: s_sub_u32 s11, s16, s2
-; GFX9-NEXT: s_subb_u32 s10, s10, 0
+; GFX9-NEXT: s_subb_u32 s15, s15, s3
+; GFX9-NEXT: s_sub_u32 s19, s16, s2
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s15, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s11, s16
+; GFX9-NEXT: s_cselect_b32 s11, s19, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 01f4414..394727c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,11 +612,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,11 +653,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -691,10 +693,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -730,10 +733,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -770,10 +774,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -813,10 +818,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -853,10 +859,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -894,15 +901,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -992,11 +999,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1034,11 +1042,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1075,10 +1084,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,10 +1127,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1160,10 +1171,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1206,10 +1218,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1248,10 +1261,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1292,15 +1306,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2059,11 +2073,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2099,11 +2114,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2138,10 +2154,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2177,10 +2194,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2217,10 +2235,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2260,10 +2279,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2301,10 +2321,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2342,15 +2363,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9db6d70..258bc295 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,11 +717,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -761,11 +762,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -803,12 +805,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -850,10 +853,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -893,13 +897,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -944,10 +949,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -987,14 +993,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1022,7 +1028,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1036,15 +1041,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2358,6 +2363,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2410,6 +2416,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2455,12 +2462,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2507,12 +2515,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2560,13 +2569,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2616,13 +2626,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2666,16 +2677,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2720,17 +2731,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4479,11 +4490,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4538,11 +4550,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4595,12 +4608,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4656,10 +4670,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4713,13 +4728,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4783,10 +4799,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4844,14 +4861,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4879,7 +4896,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4893,15 +4909,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6657,6 +6673,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6729,6 +6746,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6794,12 +6812,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6864,12 +6883,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6935,13 +6955,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7015,13 +7036,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7087,16 +7109,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7141,17 +7163,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6167a84..23c5f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,11 +499,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -539,11 +540,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -578,10 +580,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -618,10 +621,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -659,10 +663,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -702,10 +707,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1082,10 +1088,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1110,10 +1117,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1139,8 +1147,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1167,8 +1176,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1196,8 +1206,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1227,8 +1239,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2008,6 +2022,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2056,6 +2071,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2096,12 +2112,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2143,12 +2160,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2191,13 +2209,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2242,13 +2261,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2861,6 +2881,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2893,6 +2914,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2924,6 +2946,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2956,6 +2979,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2989,6 +3013,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3022,8 +3048,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3879,11 +3906,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3919,11 +3947,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3958,10 +3987,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3998,10 +4028,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4039,10 +4070,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4082,10 +4114,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4462,10 +4495,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4490,10 +4524,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4519,8 +4554,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4547,8 +4583,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4576,8 +4613,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4607,8 +4646,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5411,6 +5452,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5459,6 +5501,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5499,12 +5542,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5546,12 +5590,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5594,13 +5639,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5645,13 +5691,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6266,11 +6313,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6306,11 +6354,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6345,10 +6394,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6385,10 +6435,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6426,10 +6477,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6469,10 +6521,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6873,11 +6926,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6919,11 +6973,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6960,14 +7015,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7009,11 +7065,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7052,15 +7109,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7105,11 +7163,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7613,11 +7672,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7653,11 +7713,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7692,10 +7753,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7732,10 +7794,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7773,10 +7836,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7816,10 +7880,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8219,11 +8284,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8265,11 +8331,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8306,14 +8373,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8355,11 +8423,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8398,15 +8467,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8451,11 +8521,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8959,11 +9030,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8999,11 +9071,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9038,10 +9111,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9078,10 +9152,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9119,10 +9194,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9162,10 +9238,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9565,11 +9642,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9611,11 +9689,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9652,14 +9731,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9701,11 +9781,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9744,15 +9825,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9797,11 +9879,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10305,11 +10388,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10345,11 +10429,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10384,10 +10469,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10424,10 +10510,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10465,10 +10552,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10508,10 +10596,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11166,6 +11255,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11221,6 +11311,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11272,6 +11363,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11323,6 +11415,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11375,8 +11468,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11431,8 +11525,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12119,11 +12214,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12159,11 +12255,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12198,10 +12295,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12238,10 +12336,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12279,10 +12378,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12322,10 +12422,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12980,6 +13081,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13035,6 +13137,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13086,6 +13189,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,6 +13241,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,8 +13294,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13245,8 +13351,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13933,11 +14040,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13973,11 +14081,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14012,10 +14121,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14052,10 +14162,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14093,10 +14204,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14136,10 +14248,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14788,6 +14901,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14842,6 +14956,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14892,6 +15007,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14942,6 +15058,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14995,6 +15112,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15050,6 +15169,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15732,11 +15853,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15772,11 +15894,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15811,10 +15934,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15851,10 +15975,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15892,10 +16017,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -15935,10 +16061,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16588,6 +16715,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16642,6 +16770,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16692,6 +16821,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16742,6 +16872,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16795,6 +16926,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16850,6 +16983,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 9afc0c6..e4def28 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,11 +611,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -651,11 +652,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -690,10 +692,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -729,10 +732,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -769,10 +773,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -812,10 +817,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -852,10 +858,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -893,15 +900,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1658,11 +1665,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1698,11 +1706,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1737,10 +1746,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1776,10 +1786,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1816,10 +1827,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1859,10 +1871,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1900,10 +1913,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1941,15 +1955,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 10fd34f..39a3c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,11 +628,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -669,11 +670,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -709,10 +711,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -749,10 +752,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -790,10 +794,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -834,10 +839,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -874,10 +880,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -916,15 +923,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1826,11 +1833,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1867,11 +1875,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1907,10 +1916,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1947,10 +1957,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1988,10 +1999,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2032,10 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,10 +2086,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2115,15 +2129,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de17..4a6fa4f 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,6 +704,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -724,14 +725,16 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -743,10 +746,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -759,8 +764,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_addc_u32 s1, s13, s15
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -774,8 +781,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -789,8 +798,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -803,8 +814,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -818,8 +831,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1676,6 +1691,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1696,14 +1712,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1715,10 +1733,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1731,8 +1751,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_subb_u32 s1, s13, s15
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1746,8 +1768,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1761,8 +1785,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1775,8 +1801,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1790,8 +1818,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2188,46 +2218,49 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s10, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: s_add_u32 s12, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s11, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s11
+; VI-NEXT: s_addc_u32 s13, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s13
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s10
-; VI-NEXT: s_add_i32 s12, s8, s9
-; VI-NEXT: s_sub_i32 s13, s3, s12
+; VI-NEXT: s_mul_i32 s9, s5, s12
+; VI-NEXT: s_add_i32 s14, s8, s9
+; VI-NEXT: s_sub_i32 s10, s3, s14
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s14, s2, s8
+; VI-NEXT: s_sub_u32 s15, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_subb_u32 s13, s13, s5
-; VI-NEXT: s_sub_u32 s15, s14, s4
-; VI-NEXT: s_subb_u32 s13, s13, 0
-; VI-NEXT: s_cmp_ge_u32 s13, s5
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
+; VI-NEXT: s_subb_u32 s16, s10, s5
+; VI-NEXT: s_sub_u32 s17, s15, s4
+; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
+; VI-NEXT: s_subb_u32 s10, s16, 0
+; VI-NEXT: s_cmp_ge_u32 s10, s5
+; VI-NEXT: s_cselect_b32 s11, -1, 0
+; VI-NEXT: s_cmp_ge_u32 s17, s4
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
-; VI-NEXT: s_cselect_b32 s15, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s13, s5
-; VI-NEXT: s_cselect_b32 s13, s15, s16
-; VI-NEXT: s_add_u32 s15, s10, 1
-; VI-NEXT: s_addc_u32 s16, s11, 0
-; VI-NEXT: s_add_u32 s17, s10, 2
-; VI-NEXT: s_addc_u32 s18, s11, 0
-; VI-NEXT: s_cmp_lg_u32 s13, 0
-; VI-NEXT: s_cselect_b32 s13, s17, s15
-; VI-NEXT: s_cselect_b32 s15, s18, s16
+; VI-NEXT: s_cmp_eq_u32 s10, s5
+; VI-NEXT: s_cselect_b32 s10, s16, s11
+; VI-NEXT: s_add_u32 s11, s12, 1
+; VI-NEXT: s_addc_u32 s16, s13, 0
+; VI-NEXT: s_add_u32 s17, s12, 2
+; VI-NEXT: s_addc_u32 s18, s13, 0
+; VI-NEXT: s_cmp_lg_u32 s10, 0
+; VI-NEXT: s_cselect_b32 s10, s17, s11
+; VI-NEXT: s_cselect_b32 s11, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s12
+; VI-NEXT: s_subb_u32 s3, s3, s14
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s14, s4
+; VI-NEXT: s_cmp_ge_u32 s15, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s15, s11
-; VI-NEXT: s_cselect_b32 s8, s13, s10
+; VI-NEXT: s_cselect_b32 s9, s11, s13
+; VI-NEXT: s_cselect_b32 s8, s10, s12
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2278,8 +2311,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s10, 0, s6
+; GFX9-NEXT: s_subb_u32 s11, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2288,102 +2321,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s10, v1
-; GFX9-NEXT: v_readfirstlane_b32 s11, v0
-; GFX9-NEXT: s_mul_i32 s12, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
-; GFX9-NEXT: s_mul_i32 s13, s9, s11
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s15, s8, s11
-; GFX9-NEXT: s_mul_i32 s14, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v0
+; GFX9-NEXT: s_mul_i32 s9, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s13, s11, s8
+; GFX9-NEXT: s_add_i32 s9, s14, s9
+; GFX9-NEXT: s_add_i32 s9, s9, s13
+; GFX9-NEXT: s_mul_i32 s15, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s8, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
-; GFX9-NEXT: s_mul_i32 s15, s10, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s12, s9
+; GFX9-NEXT: s_add_u32 s9, s13, s9
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s12
-; GFX9-NEXT: s_addc_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_i32 s12, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s9, s9, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
-; GFX9-NEXT: s_add_u32 s8, s8, s16
+; GFX9-NEXT: s_add_u32 s14, s8, s9
+; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s8, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
+; GFX9-NEXT: s_add_u32 s10, s10, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s8, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
-; GFX9-NEXT: s_addc_u32 s8, s15, s13
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s8, s8, s12
+; GFX9-NEXT: s_mul_i32 s8, s12, s8
+; GFX9-NEXT: s_add_u32 s8, s10, s8
+; GFX9-NEXT: s_addc_u32 s10, 0, s9
+; GFX9-NEXT: s_add_u32 s11, s14, s8
+; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_addc_u32 s8, s12, s10
+; GFX9-NEXT: s_mul_i32 s10, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
+; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
+; GFX9-NEXT: s_add_u32 s10, s12, s10
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s9
-; GFX9-NEXT: s_mul_i32 s11, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT: s_add_u32 s11, s12, s11
-; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
-; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
-; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
+; GFX9-NEXT: s_mul_i32 s11, s3, s11
+; GFX9-NEXT: s_add_u32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
+; GFX9-NEXT: s_addc_u32 s9, s9, s13
; GFX9-NEXT: s_addc_u32 s10, s12, 0
-; GFX9-NEXT: s_mul_i32 s9, s3, s9
-; GFX9-NEXT: s_add_u32 s11, s8, s9
-; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s10
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
+; GFX9-NEXT: s_mul_i32 s8, s3, s8
+; GFX9-NEXT: s_add_u32 s12, s9, s8
+; GFX9-NEXT: s_addc_u32 s13, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s11
-; GFX9-NEXT: s_add_i32 s12, s8, s9
-; GFX9-NEXT: s_sub_i32 s13, s3, s12
-; GFX9-NEXT: s_mul_i32 s8, s6, s11
-; GFX9-NEXT: s_sub_u32 s14, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s12
+; GFX9-NEXT: s_add_i32 s14, s8, s9
+; GFX9-NEXT: s_sub_i32 s10, s3, s14
+; GFX9-NEXT: s_mul_i32 s8, s6, s12
+; GFX9-NEXT: s_sub_u32 s15, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s15, s14, s6
-; GFX9-NEXT: s_subb_u32 s13, s13, 0
-; GFX9-NEXT: s_cmp_ge_u32 s13, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_subb_u32 s16, s10, s7
+; GFX9-NEXT: s_sub_u32 s17, s15, s6
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s16, 0
+; GFX9-NEXT: s_cmp_ge_u32 s10, s7
+; GFX9-NEXT: s_cselect_b32 s11, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s6
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
-; GFX9-NEXT: s_cselect_b32 s15, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s13, s7
-; GFX9-NEXT: s_cselect_b32 s13, s15, s16
-; GFX9-NEXT: s_add_u32 s15, s11, 1
-; GFX9-NEXT: s_addc_u32 s16, s10, 0
-; GFX9-NEXT: s_add_u32 s17, s11, 2
-; GFX9-NEXT: s_addc_u32 s18, s10, 0
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b32 s13, s17, s15
-; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cmp_eq_u32 s10, s7
+; GFX9-NEXT: s_cselect_b32 s10, s16, s11
+; GFX9-NEXT: s_add_u32 s11, s12, 1
+; GFX9-NEXT: s_addc_u32 s16, s13, 0
+; GFX9-NEXT: s_add_u32 s17, s12, 2
+; GFX9-NEXT: s_addc_u32 s18, s13, 0
+; GFX9-NEXT: s_cmp_lg_u32 s10, 0
+; GFX9-NEXT: s_cselect_b32 s10, s17, s11
+; GFX9-NEXT: s_cselect_b32 s11, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s12
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s14, s6
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s15, s10
-; GFX9-NEXT: s_cselect_b32 s8, s13, s11
+; GFX9-NEXT: s_cselect_b32 s9, s11, s13
+; GFX9-NEXT: s_cselect_b32 s8, s10, s12
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2463,40 +2503,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
+; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s9, s8
-; GFX1010-NEXT: s_mul_i32 s9, s9, s5
; GFX1010-NEXT: s_mul_i32 s10, s10, s8
-; GFX1010-NEXT: s_add_i32 s9, s11, s9
-; GFX1010-NEXT: s_mul_i32 s11, s5, s12
+; GFX1010-NEXT: s_mul_i32 s9, s9, s5
+; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1010-NEXT: s_add_i32 s9, s13, s9
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1010-NEXT: s_mul_i32 s10, s5, s11
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s10, s10, s15
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1010-NEXT: s_add_u32 s12, s12, s15
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s10, s11
+; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s12, s10
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s12, 0
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
+; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1010-NEXT: s_mul_i32 s8, s3, s8
-; GFX1010-NEXT: s_add_u32 s9, s9, s12
-; GFX1010-NEXT: s_addc_u32 s11, 0, s11
+; GFX1010-NEXT: s_mul_i32 s12, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
+; GFX1010-NEXT: s_add_u32 s11, s11, s12
+; GFX1010-NEXT: s_addc_u32 s10, 0, s10
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s9, s8
+; GFX1010-NEXT: s_add_u32 s8, s11, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s11, s10
+; GFX1010-NEXT: s_addc_u32 s8, s10, s9
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2509,8 +2553,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
+; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2616,40 +2663,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
-; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
+; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
-; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
-; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
+; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
+; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
+; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
+; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2662,8 +2713,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
+; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2736,8 +2790,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2746,102 +2800,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
-; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
-; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
-; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
-; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
+; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
+; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
+; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
+; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
+; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
+; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
-; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
-; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
-; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
-; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
-; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
+; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
+; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
+; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
+; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
-; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
-; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
-; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
-; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
-; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
+; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
+; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
+; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
+; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
+; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
+; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
+; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
+; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
-; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
+; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
+; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
-; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
+; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
-; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
-; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
+; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
-; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
-; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
+; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
+; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
+; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
+; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2927,40 +2988,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
+; GFX11-NEXT: s_cselect_b32 s11, -1, 0
+; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX11-NEXT: s_mul_i32 s12, s9, s8
-; GFX11-NEXT: s_mul_i32 s9, s9, s7
; GFX11-NEXT: s_mul_i32 s10, s10, s8
-; GFX11-NEXT: s_add_i32 s9, s11, s9
-; GFX11-NEXT: s_mul_i32 s11, s7, s12
+; GFX11-NEXT: s_mul_i32 s9, s9, s7
+; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX11-NEXT: s_add_i32 s9, s13, s9
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX11-NEXT: s_mul_i32 s10, s7, s11
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s10, s10, s15
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX11-NEXT: s_add_u32 s12, s12, s15
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s10, s11
+; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s12, s10
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s12, 0
+; GFX11-NEXT: s_addc_u32 s11, s11, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
+; GFX11-NEXT: s_cselect_b32 s9, -1, 0
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX11-NEXT: s_mul_i32 s8, s3, s8
-; GFX11-NEXT: s_add_u32 s9, s9, s12
-; GFX11-NEXT: s_addc_u32 s11, 0, s11
+; GFX11-NEXT: s_mul_i32 s12, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
+; GFX11-NEXT: s_add_u32 s11, s11, s12
+; GFX11-NEXT: s_addc_u32 s10, 0, s10
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s9, s8
+; GFX11-NEXT: s_add_u32 s8, s11, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s11, s10
+; GFX11-NEXT: s_addc_u32 s8, s10, s9
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -2970,14 +3035,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
+; GFX11-NEXT: s_cselect_b32 s14, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3050,8 +3118,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
+; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3086,9 +3155,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3103,17 +3175,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
+; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
-; GFX1250-NEXT: s_mul_i32 s12, s3, s8
+; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
+; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
+; GFX1250-NEXT: s_mul_i32 s11, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3128,8 +3202,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
+; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 07e6a76..4b151b9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,8 +714,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_flbit_i32_b32 s3, s3
; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s3, s3
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index fca57be..cefcbdd 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,6 +1491,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
+; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1520,6 +1521,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e..d8a5e7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,6 +14,7 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
+; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -30,6 +31,8 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -46,6 +49,8 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
+; GFX10-NEXT: s_cselect_b32 s6, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -62,13 +67,16 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -96,6 +104,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
+; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -116,10 +125,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s1, s0, s0
-; GFX9-NEXT: s_addc_u32 s0, s0, 0
+; GFX9-NEXT: s_add_u32 s0, s2, s2
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s2, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -142,6 +153,8 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -165,9 +178,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 9a17538..62847b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,6 +1117,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1168,6 +1169,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1215,6 +1217,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1261,9 +1264,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1315,9 +1320,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4016,6 +4023,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4058,6 +4066,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4111,9 +4120,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s5, s5, 0xffe
; VI-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-NEXT: s_and_b32 s5, s5, 0xffe
; VI-NEXT: s_or_b32 s2, s6, s2
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4153,6 +4163,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4198,9 +4209,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-NEXT: s_or_b32 s2, s6, s2
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4242,6 +4254,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4288,10 +4301,11 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-NEXT: s_or_b32 s2, s6, s2
+; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-NEXT: s_lshr_b32 s6, s3, 8
+; GFX11-NEXT: s_or_b32 s2, s5, s2
+; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4334,12 +4348,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index c28b25c7..b0dd187 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,8 +599,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -709,8 +711,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -820,8 +824,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -931,8 +937,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1110,15 +1118,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1165,15 +1175,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1354,15 +1366,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1409,15 +1423,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2138,8 +2154,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2175,10 +2193,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
+; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2335,8 +2355,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2370,12 +2392,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
+; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2531,8 +2555,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2566,12 +2592,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2724,8 +2752,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2759,12 +2789,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3041,15 +3073,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3081,17 +3115,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3140,15 +3176,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3180,17 +3218,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3471,15 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3511,17 +3553,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3570,15 +3614,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3610,17 +3656,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 0deef8b..5d31177 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -236,6 +237,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -288,8 +290,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -331,10 +335,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
-; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
+; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -382,14 +387,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -431,10 +438,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
-; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
+; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -490,15 +498,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 31f277f..37756d1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,6 +472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -535,10 +536,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -604,6 +606,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -657,11 +660,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -706,8 +710,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1685,6 +1690,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1748,10 +1754,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1817,6 +1824,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1870,11 +1878,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1919,8 +1928,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2958,6 +2968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3021,10 +3032,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3090,6 +3102,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3143,11 +3156,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3192,8 +3206,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3727,6 +3742,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3790,10 +3806,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3859,6 +3876,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3912,11 +3930,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3961,8 +3980,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4999,6 +5019,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5062,10 +5083,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5131,6 +5153,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5184,11 +5207,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5246,8 +5270,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6259,6 +6284,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6328,6 +6354,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6397,6 +6424,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6457,6 +6485,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6520,6 +6550,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7686,6 +7717,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7755,6 +7787,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7824,6 +7857,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7884,6 +7918,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7947,6 +7983,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9113,6 +9150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9182,6 +9220,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9251,6 +9290,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9311,6 +9351,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9374,6 +9416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10022,6 +10065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10091,6 +10135,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10160,6 +10205,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10220,6 +10266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10283,6 +10331,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11449,6 +11498,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11518,6 +11568,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11587,6 +11638,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11647,6 +11699,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11710,6 +11764,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 4581efc..6351bb3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd570d9..a9ac008 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 1f2d70c..6311143 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,6 +532,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -595,10 +596,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -664,6 +666,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -717,11 +720,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -779,8 +783,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1857,6 +1862,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1920,10 +1926,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1989,6 +1996,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2042,11 +2050,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2104,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3182,6 +3192,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3245,10 +3256,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3314,6 +3326,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3367,11 +3380,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3429,8 +3443,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4003,6 +4018,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4066,10 +4082,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4135,6 +4152,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4188,11 +4206,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4250,8 +4269,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5327,6 +5347,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5390,10 +5411,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5459,6 +5481,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5512,11 +5535,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5574,8 +5598,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6587,6 +6612,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6656,6 +6682,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6725,6 +6752,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6785,6 +6813,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6848,6 +6878,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8013,6 +8044,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8082,6 +8114,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8151,6 +8184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8211,6 +8245,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8274,6 +8310,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9440,6 +9477,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9509,6 +9547,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9578,6 +9617,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9638,6 +9678,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9701,6 +9743,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10349,6 +10392,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10418,6 +10462,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10487,6 +10532,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10547,6 +10593,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10610,6 +10658,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11775,6 +11824,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11844,6 +11894,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11913,6 +11964,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11973,6 +12025,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12036,6 +12090,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index c3f3917..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,17 +136,19 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_cselect_b32 s13, -1, 0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-NEXT: s_and_b32 s13, s8, s13
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
+; GFX11-NEXT: v_readfirstlane_b32 s13, v0
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT: s_and_b32 s1, s8, s1
+; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s1, s1, 1
+; GFX11-NEXT: s_cselect_b32 s1, s19, s13
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_and_b32 s1, s1, 1
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 6dc9199..8748aff 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,10 +8265,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
+; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
-; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8349,13 +8351,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: v_readfirstlane_b32 s6, v1
-; GFX942-NEXT: s_mov_b32 m0, s3
-; GFX942-NEXT: v_readlane_b32 s8, v2, s3
-; GFX942-NEXT: v_writelane_b32 v0, s6, m0
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX942-NEXT: v_readfirstlane_b32 s8, v1
+; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: s_mov_b32 m0, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX942-NEXT: v_writelane_b32 v0, s8, m0
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8437,14 +8440,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
+; GFX11-NEXT: s_lshl_b32 s7, 1, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_lshl_b32 s1, 1, s1
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8524,10 +8528,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
+; GFX10-NEXT: s_lshl_b32 s7, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_lshl_b32 s1, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s1
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8604,13 +8609,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
-; GFX90A-NEXT: s_mov_b32 m0, s3
-; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
-; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: s_mov_b32 m0, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8686,13 +8692,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: v_readfirstlane_b32 s6, v1
-; GFX908-NEXT: s_mov_b32 m0, s3
-; GFX908-NEXT: v_readlane_b32 s8, v2, s3
-; GFX908-NEXT: v_writelane_b32 v0, s6, m0
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT: v_readfirstlane_b32 s8, v1
+; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: s_mov_b32 m0, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX908-NEXT: v_writelane_b32 v0, s8, m0
+; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8769,13 +8776,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v2, s3
-; GFX8-NEXT: v_writelane_b32 v0, s6, m0
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: s_mov_b32 m0, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX8-NEXT: v_writelane_b32 v0, s8, m0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9122,10 +9130,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
+; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
-; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9202,13 +9212,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: v_readfirstlane_b32 s6, v1
-; GFX942-NEXT: s_mov_b32 m0, s3
-; GFX942-NEXT: v_readlane_b32 s8, v2, s3
-; GFX942-NEXT: v_writelane_b32 v0, s6, m0
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX942-NEXT: v_readfirstlane_b32 s8, v1
+; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: s_mov_b32 m0, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX942-NEXT: v_writelane_b32 v0, s8, m0
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9285,14 +9296,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
+; GFX11-NEXT: s_lshl_b32 s7, 1, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_lshl_b32 s1, 1, s1
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9365,10 +9377,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
+; GFX10-NEXT: s_lshl_b32 s7, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_lshl_b32 s1, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s1
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9440,13 +9453,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
-; GFX90A-NEXT: s_mov_b32 m0, s3
-; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
-; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: s_mov_b32 m0, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9519,13 +9533,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: v_readfirstlane_b32 s6, v1
-; GFX908-NEXT: s_mov_b32 m0, s3
-; GFX908-NEXT: v_readlane_b32 s8, v2, s3
-; GFX908-NEXT: v_writelane_b32 v0, s6, m0
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT: v_readfirstlane_b32 s8, v1
+; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: s_mov_b32 m0, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX908-NEXT: v_writelane_b32 v0, s8, m0
+; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9599,13 +9614,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v2, s3
-; GFX8-NEXT: v_writelane_b32 v0, s6, m0
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: s_mov_b32 m0, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX8-NEXT: v_writelane_b32 v0, s8, m0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fba42c4..c1cf06e 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,8 +388,9 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit killed $scc
+ ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -416,80 +417,6 @@ body: |
S_ENDPGM 0
...
----
-name: xor_1_cmp_lg_0_killed_scc
-body: |
- ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
- ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
- ; GCN-NEXT: S_BRANCH %bb.1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: S_ENDPGM 0
- bb.0:
- successors: %bb.1(0x40000000), %bb.2(0x40000000)
- liveins: $sgpr0, $vgpr0_vgpr1
-
- %0:sreg_32 = COPY $sgpr0
- %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
- S_NOP 0, implicit killed $scc
- S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
- S_CBRANCH_SCC0 %bb.2, implicit $scc
- S_BRANCH %bb.1
-
- bb.1:
- successors: %bb.2(0x80000000)
-
- bb.2:
- S_ENDPGM 0
-
-...
----
-name: absdiff_1_cmp_lg_0_killed_scc
-body: |
- ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
- ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
- ; GCN-NEXT: S_BRANCH %bb.1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: S_ENDPGM 0
- bb.0:
- successors: %bb.1(0x40000000), %bb.2(0x40000000)
- liveins: $sgpr0, $vgpr0_vgpr1
-
- %0:sreg_32 = COPY $sgpr0
- %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
- S_NOP 0, implicit killed $scc
- S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
- S_CBRANCH_SCC0 %bb.2, implicit $scc
- S_BRANCH %bb.1
-
- bb.1:
- successors: %bb.2(0x80000000)
-
- bb.2:
- S_ENDPGM 0
-
-...
---
name: and_1_cmp_eq_1_clobbered_scc
@@ -2143,7 +2070,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 19cc7f7..f53aaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -10,6 +10,7 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -24,6 +25,7 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -38,6 +40,7 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -52,6 +55,7 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -66,6 +70,7 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -80,6 +85,7 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -94,6 +100,7 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -114,6 +121,7 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -128,6 +136,7 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -142,6 +151,7 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -156,6 +166,7 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -170,6 +181,7 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -184,6 +196,7 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -198,6 +211,7 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -217,6 +231,7 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -236,6 +251,7 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -255,6 +271,7 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -274,6 +291,7 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -293,6 +311,7 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -312,6 +331,7 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -327,6 +347,7 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -342,6 +363,7 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -357,6 +379,7 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -372,6 +395,7 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -409,6 +433,7 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -488,6 +513,7 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -526,6 +552,7 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -544,6 +571,7 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -562,6 +590,7 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -580,6 +609,7 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index 7552f6b..a828ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,6 +12,8 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -30,6 +32,8 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94..5f6d622 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,9 +56,10 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -89,6 +90,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -114,50 +116,52 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s16, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: s_add_u32 s14, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s4
+; GCN-NEXT: s_addc_u32 s15, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s17
+; GCN-NEXT: s_mul_i32 s4, s10, s15
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s16
-; GCN-NEXT: s_add_i32 s18, s4, s5
-; GCN-NEXT: s_sub_i32 s14, s7, s18
-; GCN-NEXT: s_mul_i32 s4, s10, s16
+; GCN-NEXT: s_mul_i32 s5, s11, s14
+; GCN-NEXT: s_add_i32 s16, s4, s5
+; GCN-NEXT: s_sub_i32 s17, s7, s16
+; GCN-NEXT: s_mul_i32 s4, s10, s14
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s15, s4, s5
-; GCN-NEXT: s_subb_u32 s19, s14, s11
-; GCN-NEXT: s_sub_u32 s20, s6, s10
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_or_b32 s14, s14, s15
-; GCN-NEXT: s_subb_u32 s14, s19, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s20, s10
-; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s14, s19, s15
-; GCN-NEXT: s_add_u32 s15, s16, 1
-; GCN-NEXT: s_addc_u32 s19, s17, 0
-; GCN-NEXT: s_add_u32 s20, s16, 2
-; GCN-NEXT: s_addc_u32 s21, s17, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s14, s20, s15
-; GCN-NEXT: s_cselect_b32 s15, s21, s19
+; GCN-NEXT: s_or_b32 s18, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_subb_u32 s17, s17, s11
+; GCN-NEXT: s_sub_u32 s19, s6, s10
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s18
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s17, 0
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s6, s10
-; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s19, s10
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s17, s5
+; GCN-NEXT: s_add_u32 s5, s14, 1
+; GCN-NEXT: s_addc_u32 s17, s15, 0
+; GCN-NEXT: s_add_u32 s19, s14, 2
+; GCN-NEXT: s_addc_u32 s20, s15, 0
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s15, s17
-; GCN-NEXT: s_cselect_b32 s4, s14, s16
+; GCN-NEXT: s_cselect_b32 s4, s19, s5
+; GCN-NEXT: s_cselect_b32 s5, s20, s17
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s16
+; GCN-NEXT: s_cmp_ge_u32 s7, s11
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s6, s10
+; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s11
+; GCN-NEXT: s_cselect_b32 s6, s6, s16
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_cselect_b32 s5, s5, s15
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -204,6 +208,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -237,6 +242,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1189,9 +1195,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1222,6 +1229,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1230,46 +1238,48 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s12, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: s_addc_u32 s10, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s12
+; GCN-NEXT: s_mul_i32 s8, s7, s10
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s13, s9, s8
-; GCN-NEXT: s_sub_i32 s10, 0, s13
-; GCN-NEXT: s_mul_i32 s8, s6, s12
-; GCN-NEXT: s_sub_u32 s14, 24, s8
+; GCN-NEXT: s_add_i32 s11, s9, s8
+; GCN-NEXT: s_sub_i32 s12, 0, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s10
+; GCN-NEXT: s_sub_u32 s13, 24, s8
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s12, s12, s7
+; GCN-NEXT: s_sub_u32 s15, s13, s6
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s11, s8, s9
-; GCN-NEXT: s_subb_u32 s15, s10, s7
-; GCN-NEXT: s_sub_u32 s16, s14, s6
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s6
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s10, s15, s11
-; GCN-NEXT: s_add_u32 s11, s12, 1
-; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_add_u32 s16, s12, 2
-; GCN-NEXT: s_addc_u32 s17, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_cselect_b32 s10, s16, s11
-; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, 0, s13
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_subb_u32 s8, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s6
-; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s6
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s9
+; GCN-NEXT: s_cselect_b32 s8, s12, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s12, 0, 0
+; GCN-NEXT: s_add_u32 s15, s10, 2
+; GCN-NEXT: s_addc_u32 s16, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s15, s9
+; GCN-NEXT: s_cselect_b32 s9, s16, s12
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s11, 0, s11
+; GCN-NEXT: s_cmp_ge_u32 s11, s7
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s11, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s12
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s11, 0
-; GCN-NEXT: s_cselect_b32 s6, s10, s12
+; GCN-NEXT: s_cselect_b32 s7, s9, 0
+; GCN-NEXT: s_cselect_b32 s6, s8, s10
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1305,6 +1315,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1337,6 +1348,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index e12e31b..bbd1793 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s10, 0, s9
+; GCN-NEXT: s_subb_u32 s12, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
-; GCN-NEXT: s_mul_i32 s14, s10, s12
-; GCN-NEXT: s_add_i32 s13, s15, s13
-; GCN-NEXT: s_add_i32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s12
-; GCN-NEXT: s_mul_i32 s15, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_mul_i32 s11, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
+; GCN-NEXT: s_mul_i32 s14, s12, s10
+; GCN-NEXT: s_add_i32 s11, s15, s11
+; GCN-NEXT: s_add_i32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s10
+; GCN-NEXT: s_mul_i32 s15, s10, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
-; GCN-NEXT: s_mul_i32 s16, s11, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
+; GCN-NEXT: s_mul_i32 s16, s13, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s11, s13, s11
+; GCN-NEXT: s_add_u32 s11, s14, s11
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s12, s12, s13
-; GCN-NEXT: s_addc_u32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
-; GCN-NEXT: s_add_i32 s13, s14, s13
-; GCN-NEXT: s_mul_i32 s10, s10, s12
-; GCN-NEXT: s_add_i32 s13, s13, s10
-; GCN-NEXT: s_mul_i32 s3, s3, s12
-; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
-; GCN-NEXT: s_mul_i32 s15, s11, s3
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: s_add_u32 s15, s10, s11
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s10, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
+; GCN-NEXT: s_add_i32 s10, s11, s10
+; GCN-NEXT: s_mul_i32 s12, s12, s15
+; GCN-NEXT: s_add_i32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
+; GCN-NEXT: s_mul_i32 s14, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s15, s10
+; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
-; GCN-NEXT: s_addc_u32 s3, s16, s14
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s3, s3, s13
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s3, s12, s3
-; GCN-NEXT: s_addc_u32 s14, s11, s10
+; GCN-NEXT: s_add_u32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
+; GCN-NEXT: s_addc_u32 s3, s16, s12
+; GCN-NEXT: s_addc_u32 s11, s11, 0
+; GCN-NEXT: s_mul_i32 s10, s13, s10
+; GCN-NEXT: s_add_u32 s3, s3, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s11
+; GCN-NEXT: s_add_u32 s3, s15, s3
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s14, s13, s12
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s16, s18, s8
+; GCN-NEXT: s_sub_u32 s21, s18, s8
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s16, s18
+; GCN-NEXT: s_cselect_b32 s16, s21, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s14, 0, s7
+; GCN-NEXT: s_subb_u32 s16, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
-; GCN-NEXT: s_mul_i32 s18, s14, s16
-; GCN-NEXT: s_add_i32 s17, s19, s17
-; GCN-NEXT: s_add_i32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s16
-; GCN-NEXT: s_mul_i32 s19, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
+; GCN-NEXT: s_mul_i32 s18, s16, s14
+; GCN-NEXT: s_add_i32 s15, s19, s15
+; GCN-NEXT: s_add_i32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s14
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
-; GCN-NEXT: s_mul_i32 s20, s15, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
+; GCN-NEXT: s_mul_i32 s20, s17, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s15, s17, s15
+; GCN-NEXT: s_add_u32 s15, s18, s15
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s16, s16, s17
-; GCN-NEXT: s_addc_u32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
-; GCN-NEXT: s_add_i32 s17, s18, s17
-; GCN-NEXT: s_mul_i32 s14, s14, s16
-; GCN-NEXT: s_add_i32 s17, s17, s14
-; GCN-NEXT: s_mul_i32 s9, s9, s16
-; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
-; GCN-NEXT: s_mul_i32 s19, s15, s9
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: s_add_u32 s19, s14, s15
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s14, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
+; GCN-NEXT: s_add_i32 s14, s15, s14
+; GCN-NEXT: s_mul_i32 s16, s16, s19
+; GCN-NEXT: s_add_i32 s14, s14, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
+; GCN-NEXT: s_mul_i32 s18, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s19, s14
+; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
-; GCN-NEXT: s_addc_u32 s9, s20, s18
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s9, s9, s17
-; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_add_u32 s9, s16, s9
-; GCN-NEXT: s_addc_u32 s18, s15, s14
+; GCN-NEXT: s_add_u32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
+; GCN-NEXT: s_addc_u32 s9, s20, s16
+; GCN-NEXT: s_addc_u32 s15, s15, 0
+; GCN-NEXT: s_mul_i32 s14, s17, s14
+; GCN-NEXT: s_add_u32 s9, s9, s14
+; GCN-NEXT: s_addc_u32 s16, 0, s15
+; GCN-NEXT: s_add_u32 s9, s19, s9
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s18, s17, s16
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s20, s22, s6
+; GCN-NEXT: s_sub_u32 s25, s22, s6
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s20, s22
+; GCN-NEXT: s_cselect_b32 s20, s25, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s22, 0, s7
+; GCN-NEXT: s_subb_u32 s24, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s24, v0
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
-; GCN-NEXT: s_mul_i32 s26, s22, s24
-; GCN-NEXT: s_add_i32 s25, s27, s25
-; GCN-NEXT: s_add_i32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s24
-; GCN-NEXT: s_mul_i32 s27, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
+; GCN-NEXT: v_readfirstlane_b32 s25, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
+; GCN-NEXT: s_mul_i32 s26, s24, s22
+; GCN-NEXT: s_add_i32 s23, s27, s23
+; GCN-NEXT: s_add_i32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s22
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
-; GCN-NEXT: s_mul_i32 s28, s23, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
+; GCN-NEXT: s_mul_i32 s28, s25, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s23, s25, s23
+; GCN-NEXT: s_add_u32 s23, s26, s23
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s24, s24, s25
-; GCN-NEXT: s_addc_u32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
-; GCN-NEXT: s_add_i32 s25, s26, s25
-; GCN-NEXT: s_mul_i32 s22, s22, s24
-; GCN-NEXT: s_add_i32 s25, s25, s22
-; GCN-NEXT: s_mul_i32 s17, s17, s24
-; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
-; GCN-NEXT: s_mul_i32 s27, s23, s17
-; GCN-NEXT: s_mul_i32 s29, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
+; GCN-NEXT: s_add_u32 s27, s22, s23
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s22, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
+; GCN-NEXT: s_add_i32 s22, s23, s22
+; GCN-NEXT: s_mul_i32 s24, s24, s27
+; GCN-NEXT: s_add_i32 s22, s22, s24
+; GCN-NEXT: s_mul_i32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
+; GCN-NEXT: s_mul_i32 s26, s25, s17
+; GCN-NEXT: s_mul_i32 s29, s27, s22
+; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
-; GCN-NEXT: s_addc_u32 s17, s28, s26
-; GCN-NEXT: s_addc_u32 s22, s22, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s17, s17, s25
-; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s17, s24, s17
-; GCN-NEXT: s_addc_u32 s26, s23, s22
+; GCN-NEXT: s_add_u32 s17, s17, s26
+; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
+; GCN-NEXT: s_addc_u32 s17, s28, s24
+; GCN-NEXT: s_addc_u32 s23, s23, 0
+; GCN-NEXT: s_mul_i32 s22, s25, s22
+; GCN-NEXT: s_add_u32 s17, s17, s22
+; GCN-NEXT: s_addc_u32 s24, 0, s23
+; GCN-NEXT: s_add_u32 s17, s27, s17
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s26, s25, s24
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s28, s30, s6
+; GCN-NEXT: s_sub_u32 s34, s30, s6
+; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s28, s30
+; GCN-NEXT: s_cselect_b32 s28, s34, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s20, 0, s19
+; GCN-NEXT: s_subb_u32 s22, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s21, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
-; GCN-NEXT: s_mul_i32 s24, s20, s22
-; GCN-NEXT: s_add_i32 s23, s25, s23
-; GCN-NEXT: s_add_i32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s22
-; GCN-NEXT: s_mul_i32 s25, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s20, v0
+; GCN-NEXT: s_mul_i32 s21, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
+; GCN-NEXT: s_mul_i32 s24, s22, s20
+; GCN-NEXT: s_add_i32 s21, s25, s21
+; GCN-NEXT: s_add_i32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s20
+; GCN-NEXT: s_mul_i32 s25, s20, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
-; GCN-NEXT: s_mul_i32 s26, s21, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
+; GCN-NEXT: s_mul_i32 s26, s23, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s21, s23, s21
+; GCN-NEXT: s_add_u32 s21, s24, s21
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s22, s22, s23
-; GCN-NEXT: s_addc_u32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
-; GCN-NEXT: s_add_i32 s23, s24, s23
-; GCN-NEXT: s_mul_i32 s20, s20, s22
-; GCN-NEXT: s_add_i32 s23, s23, s20
-; GCN-NEXT: s_mul_i32 s13, s13, s22
-; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
-; GCN-NEXT: s_mul_i32 s25, s21, s13
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: s_add_u32 s25, s20, s21
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s20, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
+; GCN-NEXT: s_add_i32 s20, s21, s20
+; GCN-NEXT: s_mul_i32 s22, s22, s25
+; GCN-NEXT: s_add_i32 s20, s20, s22
+; GCN-NEXT: s_mul_i32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
+; GCN-NEXT: s_mul_i32 s24, s23, s13
+; GCN-NEXT: s_mul_i32 s27, s25, s20
+; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
-; GCN-NEXT: s_addc_u32 s13, s26, s24
-; GCN-NEXT: s_addc_u32 s20, s20, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s13, s13, s23
-; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s13, s22, s13
-; GCN-NEXT: s_addc_u32 s24, s21, s20
+; GCN-NEXT: s_add_u32 s13, s13, s24
+; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
+; GCN-NEXT: s_addc_u32 s13, s26, s22
+; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_mul_i32 s20, s23, s20
+; GCN-NEXT: s_add_u32 s13, s13, s20
+; GCN-NEXT: s_addc_u32 s22, 0, s21
+; GCN-NEXT: s_add_u32 s13, s25, s13
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s24, s23, s22
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s26, s28, s18
+; GCN-NEXT: s_sub_u32 s31, s28, s18
+; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s26, s28
+; GCN-NEXT: s_cselect_b32 s26, s31, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s16, 0, s15
+; GCN-NEXT: s_subb_u32 s18, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s18, v0
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
-; GCN-NEXT: s_mul_i32 s20, s16, s18
-; GCN-NEXT: s_add_i32 s19, s21, s19
-; GCN-NEXT: s_add_i32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s18
-; GCN-NEXT: s_mul_i32 s21, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
+; GCN-NEXT: v_readfirstlane_b32 s19, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
+; GCN-NEXT: s_mul_i32 s20, s18, s16
+; GCN-NEXT: s_add_i32 s17, s21, s17
+; GCN-NEXT: s_add_i32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s16
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
-; GCN-NEXT: s_mul_i32 s22, s17, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
+; GCN-NEXT: s_mul_i32 s22, s19, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s17, s19, s17
+; GCN-NEXT: s_add_u32 s17, s20, s17
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s18, s18, s19
-; GCN-NEXT: s_addc_u32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
-; GCN-NEXT: s_add_i32 s19, s20, s19
-; GCN-NEXT: s_mul_i32 s16, s16, s18
-; GCN-NEXT: s_add_i32 s19, s19, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s17, s9
-; GCN-NEXT: s_mul_i32 s23, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
+; GCN-NEXT: s_add_u32 s21, s16, s17
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s16, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
+; GCN-NEXT: s_add_i32 s16, s17, s16
+; GCN-NEXT: s_mul_i32 s18, s18, s21
+; GCN-NEXT: s_add_i32 s16, s16, s18
+; GCN-NEXT: s_mul_i32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
+; GCN-NEXT: s_mul_i32 s20, s19, s9
+; GCN-NEXT: s_mul_i32 s23, s21, s16
+; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
-; GCN-NEXT: s_addc_u32 s9, s22, s20
-; GCN-NEXT: s_addc_u32 s16, s16, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s9, s18, s9
-; GCN-NEXT: s_addc_u32 s20, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s20
+; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
+; GCN-NEXT: s_addc_u32 s9, s22, s18
+; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_mul_i32 s16, s19, s16
+; GCN-NEXT: s_add_u32 s9, s9, s16
+; GCN-NEXT: s_addc_u32 s18, 0, s17
+; GCN-NEXT: s_add_u32 s9, s21, s9
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s20, s19, s18
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s22, s24, s14
+; GCN-NEXT: s_sub_u32 s27, s24, s14
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s22, s24
+; GCN-NEXT: s_cselect_b32 s22, s27, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb04..33b0a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s12, s5, s10
-; GCN-NEXT: s_sub_i32 s10, s7, s12
+; GCN-NEXT: s_add_i32 s10, s5, s10
+; GCN-NEXT: s_sub_i32 s11, s7, s10
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
-; GCN-NEXT: s_subb_u32 s13, s10, s9
-; GCN-NEXT: s_sub_u32 s14, s6, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s8
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, s17, s16
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
-; GCN-NEXT: s_cselect_b32 s10, s10, s15
+; GCN-NEXT: s_or_b32 s12, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s13, s6, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s12
-; GCN-NEXT: s_cmp_ge_u32 s4, s9
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s14, s11, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s8
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s9
+; GCN-NEXT: s_cselect_b32 s15, s15, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s16, s13, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s11, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s5, s16, s13
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s10
+; GCN-NEXT: s_cmp_ge_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s9
-; GCN-NEXT: s_cselect_b32 s5, s7, s5
-; GCN-NEXT: s_cmp_lg_u32 s5, 0
-; GCN-NEXT: s_cselect_b32 s4, s10, s4
-; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_cselect_b32 s8, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s8, s8, s10
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cselect_b32 s5, s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1009,9 +1016,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1042,6 +1050,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1074,43 +1083,46 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s14, s11, s12
-; GCN-NEXT: s_sub_i32 s12, s7, s14
+; GCN-NEXT: s_add_i32 s12, s11, s12
+; GCN-NEXT: s_sub_i32 s13, s7, s12
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s13, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s6, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s17, s12, s13
-; GCN-NEXT: s_subb_u32 s17, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s17, s5
-; GCN-NEXT: s_cselect_b32 s18, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s4
-; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s17, s5
-; GCN-NEXT: s_cselect_b32 s18, s19, s18
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s15, s15, s5
-; GCN-NEXT: s_sub_u32 s19, s16, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s12, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_cselect_b32 s13, s19, s16
-; GCN-NEXT: s_cselect_b32 s12, s12, s17
+; GCN-NEXT: s_or_b32 s14, s10, s11
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s13, s13, s5
+; GCN-NEXT: s_sub_u32 s15, s6, s4
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s16, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s5
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s4
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s16, s5
+; GCN-NEXT: s_cselect_b32 s17, s17, s11
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s13, s13, s5
+; GCN-NEXT: s_sub_u32 s18, s15, s4
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s7, s7, s14
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s11, s18, s15
+; GCN-NEXT: s_cselect_b32 s10, s10, s16
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s12
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s10
+; GCN-NEXT: s_cselect_b32 s4, s4, s12
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s12, s7
-; GCN-NEXT: s_cselect_b32 s4, s13, s6
+; GCN-NEXT: s_cselect_b32 s5, s10, s7
+; GCN-NEXT: s_cselect_b32 s4, s11, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1158,6 +1170,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1191,6 +1204,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1355,9 +1369,10 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1388,6 +1403,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1402,42 +1418,45 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s10, s8, s7
-; GCN-NEXT: s_sub_i32 s8, 0, s10
-; GCN-NEXT: s_sub_u32 s11, 24, s6
+; GCN-NEXT: s_add_i32 s8, s8, s7
+; GCN-NEXT: s_sub_i32 s9, 0, s8
+; GCN-NEXT: s_sub_u32 s10, 24, s6
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s11, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s9, s9, s5
+; GCN-NEXT: s_sub_u32 s12, s10, s4
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s9, s6, s7
-; GCN-NEXT: s_subb_u32 s12, s8, s5
-; GCN-NEXT: s_sub_u32 s13, s11, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_subb_u32 s14, s12, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s5
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s4
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s5
-; GCN-NEXT: s_cselect_b32 s15, s16, s15
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s13, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
-; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_subb_u32 s6, 0, s10
-; GCN-NEXT: s_cmp_ge_u32 s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s13, s9, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s4
+; GCN-NEXT: s_cmp_ge_u32 s12, s4
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s13, s5
+; GCN-NEXT: s_cselect_b32 s14, s14, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s9, s9, s5
+; GCN-NEXT: s_sub_u32 s15, s12, s4
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s6, s9, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s7, s15, s12
+; GCN-NEXT: s_cselect_b32 s6, s6, s13
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s8
+; GCN-NEXT: s_cmp_ge_u32 s8, s5
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s6, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cmp_eq_u32 s8, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s9
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s8, s6
-; GCN-NEXT: s_cselect_b32 s5, s9, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s8
+; GCN-NEXT: s_cselect_b32 s5, s7, s10
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1470,6 +1489,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1502,6 +1522,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25..bb5918b2 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -34,9 +35,11 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: s_addc_u32 s3, s3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
@@ -50,12 +53,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s4, s2, s6
-; GFX9-NEXT: s_addc_u32 s5, s3, s7
+; GFX9-NEXT: s_add_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s4, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -68,6 +73,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -84,12 +91,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -435,6 +444,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -455,14 +465,16 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -474,10 +486,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -490,8 +504,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -504,8 +520,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac..41199b0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,6 +148,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -181,6 +182,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -829,9 +831,10 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -862,6 +865,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -870,50 +874,52 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s10, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s10
+; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s11, s1, s0
-; GCN-NEXT: s_sub_i32 s8, 0, s11
-; GCN-NEXT: s_mul_i32 s0, s2, s10
-; GCN-NEXT: s_sub_u32 s12, 24, s0
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_mul_i32 s0, s2, s8
+; GCN-NEXT: s_sub_u32 s11, 24, s0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s12, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s9, s0, s1
-; GCN-NEXT: s_subb_u32 s13, s8, s3
-; GCN-NEXT: s_sub_u32 s14, s12, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s2
-; GCN-NEXT: s_cselect_b32 s13, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s8, s13, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s13, 0, 0
-; GCN-NEXT: s_add_u32 s14, s10, 2
-; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s14, s9
-; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s11
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s0, s10, 0
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s2, s1
+; GCN-NEXT: s_cselect_b32 s0, s10, s1
+; GCN-NEXT: s_add_u32 s1, s8, 1
+; GCN-NEXT: s_addc_u32 s10, 0, 0
+; GCN-NEXT: s_add_u32 s13, s8, 2
+; GCN-NEXT: s_addc_u32 s14, 0, 0
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s9, 0
-; GCN-NEXT: s_cselect_b32 s1, s8, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_cselect_b32 s0, s13, s1
+; GCN-NEXT: s_cselect_b32 s1, s14, s10
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s9, 0, s9
+; GCN-NEXT: s_cmp_ge_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s2, s2, s10
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s1, s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s0, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -939,6 +945,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -971,6 +978,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1309,6 +1317,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1338,6 +1347,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
+; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1f..cdcc914 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s12, s5, s10
-; GCN-NEXT: s_sub_i32 s10, s7, s12
+; GCN-NEXT: s_add_i32 s10, s5, s10
+; GCN-NEXT: s_sub_i32 s11, s7, s10
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
-; GCN-NEXT: s_subb_u32 s13, s10, s9
-; GCN-NEXT: s_sub_u32 s14, s6, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s8
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, s17, s16
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
-; GCN-NEXT: s_cselect_b32 s10, s10, s15
+; GCN-NEXT: s_or_b32 s12, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s13, s6, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s12
-; GCN-NEXT: s_cmp_ge_u32 s4, s9
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s14, s11, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s8
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s9
+; GCN-NEXT: s_cselect_b32 s15, s15, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s16, s13, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s11, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s5, s16, s13
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s10
+; GCN-NEXT: s_cmp_ge_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s9
-; GCN-NEXT: s_cselect_b32 s5, s7, s5
-; GCN-NEXT: s_cmp_lg_u32 s5, 0
-; GCN-NEXT: s_cselect_b32 s4, s10, s4
-; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_cselect_b32 s8, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s8, s8, s10
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cselect_b32 s5, s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -846,9 +853,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -879,6 +887,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -894,43 +903,46 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s10, s1, s0
-; GCN-NEXT: s_sub_i32 s9, 0, s10
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
+; GCN-NEXT: s_sub_u32 s8, 24, s0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s11, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s12, s8, s2
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s8, s0, s1
-; GCN-NEXT: s_subb_u32 s12, s9, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_subb_u32 s14, s12, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s3
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s3
-; GCN-NEXT: s_cselect_b32 s15, s16, s15
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s3
-; GCN-NEXT: s_sub_u32 s16, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
-; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s10
-; GCN-NEXT: s_cmp_ge_u32 s0, s3
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s13, s10, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s13, s3
+; GCN-NEXT: s_cselect_b32 s14, s14, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s15, s12, s2
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s1, s15, s12
+; GCN-NEXT: s_cselect_b32 s0, s0, s13
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s9, 0, s9
+; GCN-NEXT: s_cmp_ge_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s1, s2, s1
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s8, s0
-; GCN-NEXT: s_cselect_b32 s1, s9, s11
+; GCN-NEXT: s_cmp_eq_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s2, s2, s10
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s0, s0, s9
+; GCN-NEXT: s_cselect_b32 s1, s1, s8
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -958,6 +970,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -990,6 +1003,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1079,6 +1093,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1108,6 +1123,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
+; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db647..d67a7b1 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -34,9 +35,11 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: s_subb_u32 s3, s3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
@@ -50,12 +53,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s4, s2, s6
-; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: s_sub_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_subb_u32 s4, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -68,6 +73,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -84,12 +91,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -434,6 +443,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -454,14 +464,16 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -473,10 +485,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -489,8 +503,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -503,8 +519,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 28c6b40..75db387 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,40 +774,44 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
+; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1032-NEXT: s_mul_i32 s12, s9, s8
-; GFX1032-NEXT: s_mul_i32 s9, s9, s5
; GFX1032-NEXT: s_mul_i32 s10, s10, s8
-; GFX1032-NEXT: s_add_i32 s9, s11, s9
-; GFX1032-NEXT: s_mul_i32 s11, s5, s12
+; GFX1032-NEXT: s_mul_i32 s9, s9, s5
+; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1032-NEXT: s_add_i32 s9, s13, s9
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1032-NEXT: s_mul_i32 s10, s5, s11
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s10, s10, s15
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1032-NEXT: s_add_u32 s12, s12, s15
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s10, s11
+; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s12, s10
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s12, 0
+; GFX1032-NEXT: s_addc_u32 s11, s11, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
+; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1032-NEXT: s_mul_i32 s8, s3, s8
-; GFX1032-NEXT: s_add_u32 s9, s9, s12
-; GFX1032-NEXT: s_addc_u32 s11, 0, s11
+; GFX1032-NEXT: s_mul_i32 s12, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
+; GFX1032-NEXT: s_add_u32 s11, s11, s12
+; GFX1032-NEXT: s_addc_u32 s10, 0, s10
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s9, s8
+; GFX1032-NEXT: s_add_u32 s8, s11, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s11, s10
+; GFX1032-NEXT: s_addc_u32 s8, s10, s9
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -820,8 +824,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
+; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -894,8 +901,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s8, 0, s0
-; GFX1064-NEXT: s_subb_u32 s9, 0, s1
+; GFX1064-NEXT: s_sub_u32 s9, 0, s0
+; GFX1064-NEXT: s_subb_u32 s10, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -904,102 +911,109 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
-; GFX1064-NEXT: s_mul_i32 s10, s8, s4
-; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
-; GFX1064-NEXT: s_mul_i32 s11, s9, s5
-; GFX1064-NEXT: s_add_i32 s10, s12, s10
-; GFX1064-NEXT: s_mul_i32 s13, s8, s5
-; GFX1064-NEXT: s_add_i32 s10, s10, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
-; GFX1064-NEXT: s_mul_i32 s15, s5, s10
-; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
-; GFX1064-NEXT: s_mul_i32 s11, s4, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
+; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT: s_mul_i32 s5, s9, s8
+; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
+; GFX1064-NEXT: s_mul_i32 s11, s10, s4
+; GFX1064-NEXT: s_add_i32 s5, s12, s5
+; GFX1064-NEXT: s_mul_i32 s13, s9, s4
+; GFX1064-NEXT: s_add_i32 s5, s5, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
+; GFX1064-NEXT: s_mul_i32 s15, s4, s5
+; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
+; GFX1064-NEXT: s_mul_i32 s11, s8, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
+; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s10, s4, s10
+; GFX1064-NEXT: s_mul_i32 s5, s8, s5
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
+; GFX1064-NEXT: s_add_u32 s5, s11, s5
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s5, s5, s10
-; GFX1064-NEXT: s_addc_u32 s4, s4, s11
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
-; GFX1064-NEXT: s_mul_i32 s11, s8, s5
-; GFX1064-NEXT: s_mul_i32 s8, s8, s4
-; GFX1064-NEXT: s_mul_i32 s9, s9, s5
-; GFX1064-NEXT: s_add_i32 s8, s10, s8
-; GFX1064-NEXT: s_mul_i32 s10, s4, s11
-; GFX1064-NEXT: s_add_i32 s8, s8, s9
-; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
-; GFX1064-NEXT: s_mul_i32 s14, s5, s8
-; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
-; GFX1064-NEXT: s_add_u32 s9, s9, s14
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
+; GFX1064-NEXT: s_add_u32 s12, s4, s5
+; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_mul_i32 s4, s9, s12
+; GFX1064-NEXT: s_addc_u32 s8, s8, s11
+; GFX1064-NEXT: s_mul_i32 s10, s10, s12
+; GFX1064-NEXT: s_mul_i32 s9, s9, s8
+; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX1064-NEXT: s_add_i32 s9, s13, s9
+; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX1064-NEXT: s_add_i32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s4, s8, s4
+; GFX1064-NEXT: s_mul_i32 s14, s12, s9
+; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
+; GFX1064-NEXT: s_add_u32 s5, s5, s14
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
-; GFX1064-NEXT: s_add_u32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s8, s4, s8
-; GFX1064-NEXT: s_addc_u32 s9, s13, s12
-; GFX1064-NEXT: s_addc_u32 s10, s11, 0
-; GFX1064-NEXT: s_add_u32 s8, s9, s8
-; GFX1064-NEXT: s_addc_u32 s9, 0, s10
-; GFX1064-NEXT: s_add_u32 s5, s5, s8
-; GFX1064-NEXT: s_addc_u32 s4, s4, s9
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
-; GFX1064-NEXT: s_mul_i32 s11, s2, s4
-; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
-; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
+; GFX1064-NEXT: s_add_u32 s4, s5, s4
+; GFX1064-NEXT: s_mul_i32 s9, s8, s9
+; GFX1064-NEXT: s_addc_u32 s4, s13, s11
+; GFX1064-NEXT: s_addc_u32 s5, s10, 0
+; GFX1064-NEXT: s_add_u32 s4, s4, s9
+; GFX1064-NEXT: s_addc_u32 s9, 0, s5
+; GFX1064-NEXT: s_add_u32 s10, s12, s4
+; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
+; GFX1064-NEXT: s_addc_u32 s5, s8, s9
+; GFX1064-NEXT: s_mul_i32 s8, s3, s10
+; GFX1064-NEXT: s_mul_i32 s10, s2, s5
+; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
+; GFX1064-NEXT: s_addc_u32 s9, 0, s9
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
+; GFX1064-NEXT: s_add_u32 s8, s10, s8
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s8, s11
-; GFX1064-NEXT: s_addc_u32 s10, 0, s10
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
-; GFX1064-NEXT: s_add_u32 s5, s8, s5
-; GFX1064-NEXT: s_mul_i32 s4, s3, s4
-; GFX1064-NEXT: s_addc_u32 s5, s10, s9
+; GFX1064-NEXT: s_addc_u32 s4, s9, s4
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s5, s4
+; GFX1064-NEXT: s_add_u32 s10, s4, s5
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s8, s4, s8
+; GFX1064-NEXT: s_add_i32 s12, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s9, s3, s8
-; GFX1064-NEXT: s_sub_u32 s12, s2, s4
+; GFX1064-NEXT: s_sub_i32 s8, s3, s12
+; GFX1064-NEXT: s_sub_u32 s13, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_subb_u32 s9, s9, s1
-; GFX1064-NEXT: s_sub_u32 s13, s12, s0
-; GFX1064-NEXT: s_subb_u32 s9, s9, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_subb_u32 s14, s8, s1
+; GFX1064-NEXT: s_sub_u32 s15, s13, s0
+; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT: s_subb_u32 s8, s14, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
+; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
-; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
-; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
-; GFX1064-NEXT: s_add_u32 s13, s10, 1
+; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
+; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
+; GFX1064-NEXT: s_add_u32 s9, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
+; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s8
+; GFX1064-NEXT: s_subb_u32 s3, s3, s12
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 4445383..64d055b 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,6 +271,7 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
+; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -280,6 +281,7 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
+; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -297,6 +299,8 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
+; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -307,6 +311,7 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
+; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -316,6 +321,7 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
+; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -333,6 +339,8 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
+; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/Transforms/SCCP/constant-range-struct.ll b/llvm/test/Transforms/SCCP/constant-range-struct.ll
index 7a399df..0f45b38 100644
--- a/llvm/test/Transforms/SCCP/constant-range-struct.ll
+++ b/llvm/test/Transforms/SCCP/constant-range-struct.ll
@@ -25,7 +25,7 @@ true:
br label %exit
false:
- %s.3 = insertvalue {i64, i64} undef, i64 30, 0
+ %s.3 = insertvalue {i64, i64} poison, i64 30, 0
%s.4 = insertvalue {i64, i64} %s.3, i64 300, 1
br label %exit
@@ -39,14 +39,14 @@ define void @struct1_caller() {
; CHECK-NEXT: [[S:%.*]] = call { i64, i64 } @struct1()
; CHECK-NEXT: [[V1:%.*]] = extractvalue { i64, i64 } [[S]], 0
; CHECK-NEXT: [[V2:%.*]] = extractvalue { i64, i64 } [[S]], 1
-; CHECK-NEXT: [[T_1:%.*]] = icmp ne i64 [[V1]], 10
-; CHECK-NEXT: call void @use(i1 [[T_1]])
-; CHECK-NEXT: [[T_2:%.*]] = icmp ult i64 [[V1]], 100
-; CHECK-NEXT: call void @use(i1 [[T_2]])
-; CHECK-NEXT: [[T_3:%.*]] = icmp ne i64 [[V2]], 0
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_3:%.*]] = icmp eq i64 [[V1]], 20
; CHECK-NEXT: call void @use(i1 [[T_3]])
-; CHECK-NEXT: [[T_4:%.*]] = icmp ult i64 [[V2]], 301
-; CHECK-NEXT: call void @use(i1 [[T_4]])
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_6:%.*]] = icmp eq i64 [[V2]], 300
+; CHECK-NEXT: call void @use(i1 [[T_6]])
; CHECK-NEXT: ret void
;
%s = call {i64, i64} @struct1()
@@ -57,10 +57,14 @@ define void @struct1_caller() {
call void @use(i1 %t.1)
%t.2 = icmp ult i64 %v1, 100
call void @use(i1 %t.2)
- %t.3 = icmp ne i64 %v2, 0
+ %t.3 = icmp eq i64 %v1, 20
call void @use(i1 %t.3)
- %t.4 = icmp ult i64 %v2, 301
+ %t.4 = icmp ne i64 %v2, 0
call void @use(i1 %t.4)
+ %t.5 = icmp ult i64 %v2, 301
+ call void @use(i1 %t.5)
+ %t.6 = icmp eq i64 %v2, 300
+ call void @use(i1 %t.6)
ret void
}
@@ -76,7 +80,7 @@ define internal {i64, i64} @struct2() {
; CHECK: exit:
; CHECK-NEXT: [[V1:%.*]] = phi i64 [ 20, [[TRUE]] ], [ 30, [[FALSE]] ]
; CHECK-NEXT: [[V2:%.*]] = phi i64 [ 200, [[TRUE]] ], [ 300, [[FALSE]] ]
-; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } undef, i64 [[V1]], 0
+; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } poison, i64 [[V1]], 0
; CHECK-NEXT: [[S_2:%.*]] = insertvalue { i64, i64 } [[S_1]], i64 [[V2]], 1
; CHECK-NEXT: ret { i64, i64 } [[S_2]]
;
@@ -92,7 +96,7 @@ false:
exit:
%v1 = phi i64 [ 20, %true ], [ 30, %false ]
%v2 = phi i64 [ 200, %true ], [ 300, %false ]
- %s.1 = insertvalue {i64, i64} undef, i64 %v1, 0
+ %s.1 = insertvalue {i64, i64} poison, i64 %v1, 0
%s.2 = insertvalue {i64, i64} %s.1, i64 %v2, 1
ret {i64, i64} %s.2
}
@@ -153,3 +157,40 @@ define void @struct2_caller() {
ret void
}
+
+%"phi_type" = type {i64, i64}
+
+define internal %"phi_type" @test(i32 %input) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: br label [[COND_TRUE_I:%.*]]
+; CHECK: cond.true.i:
+; CHECK-NEXT: br label [[COND_END_I:%.*]]
+; CHECK: cond.end.i:
+; CHECK-NEXT: ret [[PHI_TYPE:%.*]] poison
+;
+ %cmp.cond = icmp eq i32 %input, 1
+ br i1 %cmp.cond, label %cond.true.i, label %cond.false.i
+
+cond.true.i:
+ %r1.tmp = insertvalue %"phi_type" poison, i64 1, 0
+ %r1.tmp.2 = insertvalue %"phi_type" %r1.tmp, i64 2, 1
+ br label %cond.end.i
+
+cond.false.i:
+ %r2.tmp = insertvalue %"phi_type" poison, i64 3, 0
+ %r2.tmp.2 = insertvalue %"phi_type" %r2.tmp, i64 4, 1
+ br label %cond.end.i
+
+cond.end.i:
+ %retval = phi %"phi_type" [ %r1.tmp.2, %cond.true.i ], [ %r2.tmp.2, %cond.false.i ]
+ ret %"phi_type" %retval
+}
+
+define %"phi_type" @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[CALL_1:%.*]] = tail call fastcc [[PHI_TYPE:%.*]] @[[TEST:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 noundef 1)
+; CHECK-NEXT: ret [[PHI_TYPE]] { i64 1, i64 2 }
+;
+ %call.1 = tail call fastcc noundef %"phi_type" @test(i32 noundef 1)
+ ret %"phi_type" %call.1
+}
diff --git a/llvm/unittests/ADT/BitTest.cpp b/llvm/unittests/ADT/BitTest.cpp
index eaed4e1..5b3df91 100644
--- a/llvm/unittests/ADT/BitTest.cpp
+++ b/llvm/unittests/ADT/BitTest.cpp
@@ -270,6 +270,22 @@ TEST(BitTest, BitWidthConstexpr) {
llvm::bit_width_constexpr(std::numeric_limits<uint64_t>::max()) == 64);
}
+TEST(BitTest, BitCeilConstexpr) {
+ static_assert(llvm::bit_ceil_constexpr(0u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(1u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(2u) == 2);
+ static_assert(llvm::bit_ceil_constexpr(3u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(4u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(5u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(6u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(7u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(8u) == 8);
+
+ static_assert(llvm::bit_ceil_constexpr(255u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(256u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(257u) == 512);
+}
+
TEST(BitTest, CountlZero) {
uint8_t Z8 = 0;
uint16_t Z16 = 0;