-rw-r--r-- clang-tools-extra/clang-tidy/ClangTidyOptions.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp | 16
-rw-r--r-- clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp | 8
-rw-r--r-- clang-tools-extra/clang-tidy/android/CloexecCheck.cpp | 12
-rw-r--r-- clang-tools-extra/clang-tidy/android/CloexecCheck.h | 4
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp | 11
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp | 19
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp | 12
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp | 7
-rw-r--r-- clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp | 53
-rw-r--r-- clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp | 10
-rw-r--r-- clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp | 6
-rw-r--r-- clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp | 9
-rw-r--r-- clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp | 9
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp | 7
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp | 24
-rw-r--r-- clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp | 7
-rw-r--r-- clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp | 6
-rw-r--r-- clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp | 27
-rw-r--r-- clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp | 9
-rw-r--r-- clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp | 17
-rw-r--r-- clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp | 16
-rw-r--r-- clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp | 22
-rw-r--r-- clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp | 15
-rw-r--r-- clang/docs/ClangFormatStyleOptions.rst | 83
-rw-r--r-- clang/include/clang/Basic/BuiltinsX86.td | 9
-rw-r--r-- clang/include/clang/Format/Format.h | 86
-rw-r--r-- clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h | 17
-rw-r--r-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 9
-rw-r--r-- clang/lib/AST/ExprConstant.cpp | 8
-rw-r--r-- clang/lib/Format/BreakableToken.cpp | 6
-rw-r--r-- clang/lib/Format/ContinuationIndenter.cpp | 9
-rw-r--r-- clang/lib/Format/Format.cpp | 22
-rw-r--r-- clang/lib/Format/FormatToken.cpp | 10
-rw-r--r-- clang/lib/Format/TokenAnnotator.cpp | 23
-rw-r--r-- clang/lib/Format/WhitespaceManager.cpp | 6
-rw-r--r-- clang/lib/Headers/avx2intrin.h | 8
-rw-r--r-- clang/lib/Headers/avx512bwintrin.h | 15
-rw-r--r-- clang/lib/Headers/avx512vlbwintrin.h | 8
-rw-r--r-- clang/lib/Headers/tmmintrin.h | 13
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp | 9
-rw-r--r-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 6
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp | 2
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp | 2
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp | 8
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp | 4
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h | 2
-rw-r--r-- clang/test/CodeGen/X86/avx2-builtins.c | 1
-rw-r--r-- clang/test/CodeGen/X86/avx512bw-builtins.c | 6
-rw-r--r-- clang/test/CodeGen/X86/avx512vlbw-builtins.c | 4
-rw-r--r-- clang/test/CodeGen/X86/mmx-builtins.c | 1
-rw-r--r-- clang/test/CodeGen/X86/ssse3-builtins.c | 1
-rw-r--r-- clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp | 8
-rw-r--r-- clang/unittests/Format/ConfigParseTest.cpp | 13
-rw-r--r-- clang/unittests/Format/FormatTest.cpp | 12
-rw-r--r-- clang/unittests/Format/FormatTestCSharp.cpp | 2
-rw-r--r-- clang/unittests/Format/FormatTestComments.cpp | 52
-rw-r--r-- clang/unittests/Format/FormatTestJava.cpp | 2
-rw-r--r-- clang/unittests/Format/FormatTestTextProto.cpp | 2
-rw-r--r-- clang/unittests/Format/FormatTestVerilog.cpp | 2
-rw-r--r-- compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp | 5
-rw-r--r-- flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt | 4
-rw-r--r-- libcxx/docs/ReleaseNotes/21.rst | 3
-rw-r--r-- libcxx/docs/ReleaseNotes/22.rst | 1
-rw-r--r-- libcxx/docs/Status/Cxx2cIssues.csv | 2
-rw-r--r-- libcxx/docs/Status/Cxx2cPapers.csv | 2
-rw-r--r-- llvm/benchmarks/SpecialCaseListBM.cpp | 29
-rw-r--r-- llvm/include/llvm/ADT/Bitfields.h | 9
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 2
-rw-r--r-- llvm/include/llvm/ADT/DepthFirstIterator.h | 18
-rw-r--r-- llvm/include/llvm/ADT/ImmutableSet.h | 6
-rw-r--r-- llvm/include/llvm/ADT/PostOrderIterator.h | 6
-rw-r--r-- llvm/include/llvm/ADT/STLExtras.h | 2
-rw-r--r-- llvm/include/llvm/ADT/SmallPtrSet.h | 12
-rw-r--r-- llvm/include/llvm/ADT/bit.h | 15
-rw-r--r-- llvm/include/llvm/Support/Alignment.h | 2
-rw-r--r-- llvm/include/llvm/Support/Casting.h | 7
-rw-r--r-- llvm/include/llvm/Support/CommandLine.h | 2
-rw-r--r-- llvm/include/llvm/Support/DOTGraphTraits.h | 3
-rw-r--r-- llvm/include/llvm/Support/ELFAttributes.h | 2
-rw-r--r-- llvm/include/llvm/Support/LSP/Protocol.h | 2
-rw-r--r-- llvm/include/llvm/Support/MD5.h | 2
-rw-r--r-- llvm/include/llvm/Support/Timer.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 9
-rw-r--r-- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 69
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 24
-rw-r--r-- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 96
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 12
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 9
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/abs_i32.ll | 92
-rw-r--r-- llvm/test/CodeGen/AMDGPU/addsub64_carry.ll | 36
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 1260
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 135
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 210
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 585
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 90
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll | 90
-rw-r--r-- llvm/test/CodeGen/AMDGPU/carryout-selection.ll | 614
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 3
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll | 25
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 35
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 128
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 36
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 115
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 81
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 81
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 115
-rw-r--r-- llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll | 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 128
-rw-r--r-- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 82
-rw-r--r-- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 30
-rw-r--r-- llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 146
-rw-r--r-- llvm/test/CodeGen/AMDGPU/srem.ll | 654
-rw-r--r-- llvm/test/CodeGen/AMDGPU/srem64.ll | 207
-rw-r--r-- llvm/test/CodeGen/AMDGPU/uaddo.ll | 54
-rw-r--r-- llvm/test/CodeGen/AMDGPU/udiv64.ll | 80
-rw-r--r-- llvm/test/CodeGen/AMDGPU/urem64.ll | 146
-rw-r--r-- llvm/test/CodeGen/AMDGPU/usubo.ll | 54
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wave32.ll | 190
-rw-r--r-- llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 8
-rw-r--r-- llvm/test/Transforms/SCCP/constant-range-struct.ll | 65
-rw-r--r-- llvm/unittests/ADT/BitTest.cpp | 16
-rw-r--r-- mlir/include/mlir/Dialect/SMT/IR/SMTOps.td | 2
-rw-r--r-- mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h | 1
-rw-r--r-- mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td | 17
-rw-r--r-- mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp | 83
-rw-r--r-- mlir/python/mlir/dialects/transform/smt.py | 12
-rw-r--r-- mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir | 116
-rw-r--r-- mlir/test/Dialect/Transform/test-smt-extension.mlir | 21
-rw-r--r-- mlir/test/python/dialects/transform_smt_ext.py | 30
146 files changed, 4256 insertions, 2844 deletions
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
index b752a9b..21455db 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp
@@ -154,6 +154,7 @@ template <> struct ScalarEnumerationTraits<clang::DiagnosticIDs::Level> {
}
};
template <> struct SequenceElementTraits<ClangTidyOptions::CustomCheckDiag> {
+ // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h
static const bool flow = false;
};
template <> struct MappingTraits<ClangTidyOptions::CustomCheckDiag> {
@@ -165,6 +166,7 @@ template <> struct MappingTraits<ClangTidyOptions::CustomCheckDiag> {
}
};
template <> struct SequenceElementTraits<ClangTidyOptions::CustomCheckValue> {
+ // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h
static const bool flow = false;
};
template <> struct MappingTraits<ClangTidyOptions::CustomCheckValue> {
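
The two NOLINTNEXTLINE suppressions above exist because "flow" is a member name that LLVM's YAML I/O machinery (YAMLTraits.h) looks up by exact spelling, so it cannot be renamed to satisfy readability-identifier-naming. A minimal sketch of the trait pattern, using a hypothetical Diag type that is not part of this patch:

#include "llvm/Support/YAMLTraits.h"
#include <string>

struct Diag {
  std::string Name;
  int Level;
};

namespace llvm {
namespace yaml {
// The member names 'mapping' and 'flow' are fixed by YAMLTraits.h; the
// naming check has to be silenced rather than the members renamed.
template <> struct MappingTraits<Diag> {
  static void mapping(IO &Io, Diag &D) {
    Io.mapRequired("name", D.Name);
    Io.mapRequired("level", D.Level);
  }
};
template <> struct SequenceElementTraits<Diag> {
  static const bool flow = false; // false = block-style YAML sequence
};
} // namespace yaml
} // namespace llvm
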
diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp
index d7cc0ca..a58c041 100644
--- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp
@@ -45,7 +45,10 @@ struct StrCatCheckResult {
std::vector<FixItHint> Hints;
};
-void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) {
+} // namespace
+
+static void removeCallLeaveArgs(const CallExpr *Call,
+ StrCatCheckResult *CheckResult) {
if (Call->getNumArgs() == 0)
return;
// Remove 'Foo('
@@ -58,9 +61,9 @@ void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) {
Call->getRParenLoc(), Call->getEndLoc().getLocWithOffset(1))));
}
-const clang::CallExpr *processArgument(const Expr *Arg,
- const MatchFinder::MatchResult &Result,
- StrCatCheckResult *CheckResult) {
+static const clang::CallExpr *
+processArgument(const Expr *Arg, const MatchFinder::MatchResult &Result,
+ StrCatCheckResult *CheckResult) {
const auto IsAlphanum = hasDeclaration(cxxMethodDecl(hasName("AlphaNum")));
static const auto *const Strcat = new auto(hasName("::absl::StrCat"));
const auto IsStrcat = cxxBindTemporaryExpr(
@@ -78,8 +81,8 @@ const clang::CallExpr *processArgument(const Expr *Arg,
return nullptr;
}
-StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend,
- const MatchFinder::MatchResult &Result) {
+static StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend,
+ const MatchFinder::MatchResult &Result) {
StrCatCheckResult CheckResult;
std::deque<const CallExpr *> CallsToProcess = {RootCall};
@@ -101,7 +104,6 @@ StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend,
}
return CheckResult;
}
-} // namespace
void RedundantStrcatCallsCheck::check(const MatchFinder::MatchResult &Result) {
bool IsAppend = false;
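
This hunk shows the pattern repeated across most files in this patch: per the LLVM coding standards, anonymous namespaces are kept as small as possible and reserved for type declarations, while file-local functions are marked static instead. A minimal sketch of the before/after shape, with hypothetical names not taken from the patch:

// Before: everything file-local lived in the anonymous namespace.
// namespace {
// struct Helper { int Value; };
// int twice(int X) { return 2 * X; }
// } // namespace

// After: only the type stays in the anonymous namespace.
namespace {
struct Helper { int Value; };
} // namespace

static int twice(int X) { return 2 * X; }
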
diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp
index 6aad3c6..e90cdd0 100644
--- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp
@@ -215,13 +215,13 @@ bool UnrollLoopsCheck::hasLargeNumIterations(const Stmt *Statement,
break;
case (BO_MulAssign):
Iterations =
- 1 + (std::log((double)EndValue) - std::log((double)InitValue)) /
- std::log((double)ConstantValue);
+ 1 + ((std::log((double)EndValue) - std::log((double)InitValue)) /
+ std::log((double)ConstantValue));
break;
case (BO_DivAssign):
Iterations =
- 1 + (std::log((double)InitValue) - std::log((double)EndValue)) /
- std::log((double)ConstantValue);
+ 1 + ((std::log((double)InitValue) - std::log((double)EndValue)) /
+ std::log((double)ConstantValue));
break;
default:
// All other operators are not handled; assume large bounds.
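
The added parentheses do not change the computed value (the division already binds tighter than the outer addition); they only make the intended grouping explicit. The estimate itself: for a loop of the form for (I = Init; I < End; I *= C), the iteration count is roughly 1 + (log(End) - log(Init)) / log(C). A small sketch restating it, not code from the patch:

#include <cmath>

// For Init = 1, End = 100, C = 2 this yields 1 + log(100)/log(2) ~= 7.6,
// close to the 7 iterations actually executed (I = 1, 2, 4, ..., 64).
static double estimateMulAssignIterations(double Init, double End, double C) {
  return 1 + ((std::log(End) - std::log(Init)) / std::log(C));
}
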
diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp
index cd83423..48c54c0 100644
--- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp
+++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp
@@ -16,12 +16,13 @@ using namespace clang::ast_matchers;
namespace clang::tidy::android {
-namespace {
// Helper function to form the correct string mode for Type3.
// Build the replace text. If it's string constant, add <Mode> directly in the
// end of the string. Else, add <Mode>.
-std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM,
- const LangOptions &LangOpts, char Mode) {
+static std::string buildFixMsgForStringFlag(const Expr *Arg,
+ const SourceManager &SM,
+ const LangOptions &LangOpts,
+ char Mode) {
if (Arg->getBeginLoc().isMacroID())
return (Lexer::getSourceText(
CharSourceRange::getTokenRange(Arg->getSourceRange()), SM,
@@ -32,11 +33,6 @@ std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM,
StringRef SR = cast<StringLiteral>(Arg->IgnoreParenCasts())->getString();
return ("\"" + SR + Twine(Mode) + "\"").str();
}
-} // namespace
-
-const char *CloexecCheck::FuncDeclBindingStr = "funcDecl";
-
-const char *CloexecCheck::FuncBindingStr = "func";
void CloexecCheck::registerMatchersImpl(
MatchFinder *Finder, internal::Matcher<FunctionDecl> Function) {
diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h
index 79f7ab3..b2b59f5 100644
--- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h
+++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h
@@ -89,10 +89,10 @@ protected:
int N) const;
/// Binding name of the FuncDecl of a function call.
- static const char *FuncDeclBindingStr;
+ static constexpr char FuncDeclBindingStr[] = "funcDecl";
/// Binding name of the function call expression.
- static const char *FuncBindingStr;
+ static constexpr char FuncBindingStr[] = "func";
};
} // namespace clang::tidy::android
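
Replacing the out-of-line static const char * members with in-class static constexpr char arrays works because, since C++17, a constexpr static data member is implicitly inline: the in-class definition is the only definition needed, which is why the two definitions in CloexecCheck.cpp are deleted above. A minimal sketch with a hypothetical class:

struct CheckBase {
  // One in-class definition; no 'const char *CheckBase::...' definition
  // in a .cpp file is required (implicitly inline since C++17).
  static constexpr char FuncBindingStr[] = "func";
};

static_assert(CheckBase::FuncBindingStr[0] == 'f',
              "usable in constant expressions");
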
diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
index d8207b3..b4ee351 100644
--- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
@@ -1074,7 +1074,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
WorkType = To;
}
- if (Ctx.hasSameType(WorkType, To)) {
+ if (ASTContext::hasSameType(WorkType, To)) {
LLVM_DEBUG(llvm::dbgs() << "<<< approximateStdConv. Reached 'To' type.\n");
return {Ctx.getCommonSugaredType(WorkType, To)};
}
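
ASTContext::hasSameType needs no per-instance state (it compares canonical types), so it is a static member, and the patch consistently rewrites instance-style calls like Ctx.hasSameType(...) into qualified static calls. A sketch of the general fixup using a hypothetical stand-in class, not Clang's real API:

struct Context {
  static bool hasSameType(int A, int B) { return A == B; }
};

static bool demo(Context &Ctx) {
  (void)Ctx; // the object is no longer needed for the call
  // Before: Ctx.hasSameType(1, 1);  // static member accessed via instance
  return Context::hasSameType(1, 1); // After: the qualified call is explicit
}
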
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp
index 390f3dd..54ed899 100644
--- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp
@@ -18,8 +18,11 @@ namespace {
AST_MATCHER(Expr, isInMacro) { return Node.getBeginLoc().isMacroID(); }
+} // namespace
+
/// Find the next statement after `S`.
-const Stmt *nextStmt(const MatchFinder::MatchResult &Result, const Stmt *S) {
+static const Stmt *nextStmt(const MatchFinder::MatchResult &Result,
+ const Stmt *S) {
auto Parents = Result.Context->getParents(*S);
if (Parents.empty())
return nullptr;
@@ -40,8 +43,8 @@ using ExpansionRanges = std::vector<SourceRange>;
/// \brief Get all the macro expansion ranges related to `Loc`.
///
/// The result is ordered from most inner to most outer.
-ExpansionRanges getExpansionRanges(SourceLocation Loc,
- const MatchFinder::MatchResult &Result) {
+static ExpansionRanges
+getExpansionRanges(SourceLocation Loc, const MatchFinder::MatchResult &Result) {
ExpansionRanges Locs;
while (Loc.isMacroID()) {
Locs.push_back(
@@ -51,8 +54,6 @@ ExpansionRanges getExpansionRanges(SourceLocation Loc,
return Locs;
}
-} // namespace
-
void MultipleStatementMacroCheck::registerMatchers(MatchFinder *Finder) {
const auto Inner = expr(isInMacro(), unless(compoundStmt())).bind("inner");
Finder->addMatcher(
diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
index 86af5cb..c262b1c 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
@@ -245,12 +245,10 @@ struct OptionEnumMapping<
namespace bugprone {
-namespace {
-
/// Returns if a function is declared inside a system header.
/// These functions are considered to be "standard" (system-provided) library
/// functions.
-bool isStandardFunction(const FunctionDecl *FD) {
+static bool isStandardFunction(const FunctionDecl *FD) {
// Find a possible redeclaration in system header.
// FIXME: Looking at the canonical declaration is not the most exact way
// to do this.
@@ -284,7 +282,7 @@ bool isStandardFunction(const FunctionDecl *FD) {
/// Check if a statement is "C++-only".
/// This includes all statements that have a class name with "CXX" prefix
/// and every other statement that is declared in file ExprCXX.h.
-bool isCXXOnlyStmt(const Stmt *S) {
+static bool isCXXOnlyStmt(const Stmt *S) {
StringRef Name = S->getStmtClassName();
if (Name.starts_with("CXX"))
return true;
@@ -304,7 +302,8 @@ bool isCXXOnlyStmt(const Stmt *S) {
/// called from \p Caller, get a \c CallExpr of the corresponding function call.
/// It is unspecified which call is found if multiple calls exist, but the order
/// should be deterministic (depend only on the AST).
-Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) {
+static Expr *findCallExpr(const CallGraphNode *Caller,
+ const CallGraphNode *Callee) {
const auto *FoundCallee = llvm::find_if(
Caller->callees(), [Callee](const CallGraphNode::CallRecord &Call) {
return Call.Callee == Callee;
@@ -314,7 +313,7 @@ Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) {
return FoundCallee->CallExpr;
}
-SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) {
+static SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) {
ParentMapContext &PM = Ctx.getParentMapContext();
DynTypedNode P = DynTypedNode::create(*S);
while (P.getSourceRange().isInvalid()) {
@@ -326,9 +325,9 @@ SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) {
return P.getSourceRange();
}
-AST_MATCHER(FunctionDecl, isStandardFunction) {
- return isStandardFunction(&Node);
-}
+namespace {
+
+AST_MATCHER(FunctionDecl, isStandard) { return isStandardFunction(&Node); }
} // namespace
@@ -354,7 +353,7 @@ bool SignalHandlerCheck::isLanguageVersionSupported(
void SignalHandlerCheck::registerMatchers(MatchFinder *Finder) {
auto SignalFunction = functionDecl(hasAnyName("::signal", "::std::signal"),
- parameterCountIs(2), isStandardFunction());
+ parameterCountIs(2), isStandard());
auto HandlerExpr =
declRefExpr(hasDeclaration(functionDecl().bind("handler_decl")),
unless(isExpandedFromMacro("SIG_IGN")),
diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
index cdb6a08..cf55dd7 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
@@ -424,7 +424,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
"suspicious usage of 'sizeof(array)/sizeof(...)';"
" denominator differs from the size of array elements")
<< E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
- } else if (NumTy && DenomTy && Ctx.hasSameType(NumTy, DenomTy) &&
+ } else if (NumTy && DenomTy && ASTContext::hasSameType(NumTy, DenomTy) &&
!NumTy->isDependentType()) {
// Dependent type should not be compared.
diag(E->getOperatorLoc(),
@@ -433,7 +433,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
<< E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
} else if (!WarnOnSizeOfPointer) {
// When 'WarnOnSizeOfPointer' is enabled, these messages become redundant:
- if (PointedTy && DenomTy && Ctx.hasSameType(PointedTy, DenomTy)) {
+ if (PointedTy && DenomTy && ASTContext::hasSameType(PointedTy, DenomTy)) {
diag(E->getOperatorLoc(),
"suspicious usage of 'sizeof(...)/sizeof(...)'; size of pointer "
"is divided by size of pointed type")
@@ -462,8 +462,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
const auto *SizeOfExpr =
Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("sizeof-ptr-mul-expr");
- if (Ctx.hasSameType(LPtrTy, RPtrTy) &&
- Ctx.hasSameType(LPtrTy, SizeofArgTy)) {
+ if (ASTContext::hasSameType(LPtrTy, RPtrTy) &&
+ ASTContext::hasSameType(LPtrTy, SizeofArgTy)) {
diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in "
"pointer arithmetic")
<< SizeOfExpr->getSourceRange() << E->getOperatorLoc()
@@ -477,8 +477,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
const auto *SizeOfExpr =
Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("sizeof-ptr-div-expr");
- if (Ctx.hasSameType(LPtrTy, RPtrTy) &&
- Ctx.hasSameType(LPtrTy, SizeofArgTy)) {
+ if (ASTContext::hasSameType(LPtrTy, RPtrTy) &&
+ ASTContext::hasSameType(LPtrTy, SizeofArgTy)) {
diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in "
"pointer arithmetic")
<< SizeOfExpr->getSourceRange() << E->getOperatorLoc()
diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
index 0c8d2b8..cef8b4d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
@@ -50,7 +50,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context,
return false;
// Check if return types are identical.
- if (Context->hasSameType(DerivedReturnTy, BaseReturnTy))
+ if (ASTContext::hasSameType(DerivedReturnTy, BaseReturnTy))
return true;
/// Check if the return types are covariant.
@@ -77,7 +77,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context,
if (DRD == BRD)
return true;
- if (!Context->hasSameUnqualifiedType(DTy, BTy)) {
+ if (!ASTContext::hasSameUnqualifiedType(DTy, BTy)) {
// Begin checking whether the conversion from D to B is valid.
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
/*DetectVirtual=*/false);
@@ -87,7 +87,8 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context,
return false;
// Check ambiguity.
- if (Paths.isAmbiguous(Context->getCanonicalType(BTy).getUnqualifiedType()))
+ if (Paths.isAmbiguous(
+ ASTContext::getCanonicalType(BTy).getUnqualifiedType()))
return false;
// Check accessibility.
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp
index 37d737a..1ac9b8b 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp
@@ -28,10 +28,13 @@ AST_MATCHER(CXXRecordDecl, hasDefaultConstructor) {
return Node.hasDefaultConstructor();
}
+} // namespace
+
// Iterate over all the fields in a record type, both direct and indirect (e.g.
// if the record contains an anonymous struct).
template <typename T, typename Func>
-void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) {
+static void forEachField(const RecordDecl &Record, const T &Fields,
+ const Func &Fn) {
for (const FieldDecl *F : Fields) {
if (F->isAnonymousStructOrUnion()) {
if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl())
@@ -43,8 +46,9 @@ void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) {
}
template <typename T, typename Func>
-void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields,
- bool &AnyMemberHasInitPerUnion, const Func &Fn) {
+static void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields,
+ bool &AnyMemberHasInitPerUnion,
+ const Func &Fn) {
for (const FieldDecl *F : Fields) {
if (F->isAnonymousStructOrUnion()) {
if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl()) {
@@ -59,8 +63,9 @@ void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields,
}
}
-void removeFieldInitialized(const FieldDecl *M,
- SmallPtrSetImpl<const FieldDecl *> &FieldDecls) {
+static void
+removeFieldInitialized(const FieldDecl *M,
+ SmallPtrSetImpl<const FieldDecl *> &FieldDecls) {
const RecordDecl *R = M->getParent();
if (R && R->isUnion()) {
// Erase all members in a union if any member of it is initialized.
@@ -70,9 +75,9 @@ void removeFieldInitialized(const FieldDecl *M,
FieldDecls.erase(M);
}
-void removeFieldsInitializedInBody(
- const Stmt &Stmt, ASTContext &Context,
- SmallPtrSetImpl<const FieldDecl *> &FieldDecls) {
+static void
+removeFieldsInitializedInBody(const Stmt &Stmt, ASTContext &Context,
+ SmallPtrSetImpl<const FieldDecl *> &FieldDecls) {
auto Matches =
match(findAll(binaryOperator(
hasOperatorName("="),
@@ -82,9 +87,9 @@ void removeFieldsInitializedInBody(
removeFieldInitialized(Match.getNodeAs<FieldDecl>("fieldDecl"), FieldDecls);
}
-StringRef getName(const FieldDecl *Field) { return Field->getName(); }
+static StringRef getName(const FieldDecl *Field) { return Field->getName(); }
-StringRef getName(const RecordDecl *Record) {
+static StringRef getName(const RecordDecl *Record) {
// Get the typedef name if this is a C-style anonymous struct and typedef.
if (const TypedefNameDecl *Typedef = Record->getTypedefNameForAnonDecl())
return Typedef->getName();
@@ -94,7 +99,7 @@ StringRef getName(const RecordDecl *Record) {
// Creates comma separated list of decls requiring initialization in order of
// declaration.
template <typename R, typename T>
-std::string
+static std::string
toCommaSeparatedString(const R &OrderedDecls,
const SmallPtrSetImpl<const T *> &DeclsToInit) {
SmallVector<StringRef, 16> Names;
@@ -105,12 +110,14 @@ toCommaSeparatedString(const R &OrderedDecls,
return llvm::join(Names.begin(), Names.end(), ", ");
}
-SourceLocation getLocationForEndOfToken(const ASTContext &Context,
- SourceLocation Location) {
+static SourceLocation getLocationForEndOfToken(const ASTContext &Context,
+ SourceLocation Location) {
return Lexer::getLocForEndOfToken(Location, 0, Context.getSourceManager(),
Context.getLangOpts());
}
+namespace {
+
// There are 3 kinds of insertion placements:
enum class InitializerPlacement {
// 1. The fields are inserted after an existing CXXCtorInitializer stored in
@@ -187,15 +194,17 @@ struct InitializerInsertion {
SmallVector<std::string, 4> Initializers;
};
+} // namespace
+
// Convenience utility to get a RecordDecl from a QualType.
-const RecordDecl *getCanonicalRecordDecl(const QualType &Type) {
+static const RecordDecl *getCanonicalRecordDecl(const QualType &Type) {
if (const auto *RT = Type->getAsCanonical<RecordType>())
return RT->getDecl();
return nullptr;
}
template <typename R, typename T>
-SmallVector<InitializerInsertion, 16>
+static SmallVector<InitializerInsertion, 16>
computeInsertions(const CXXConstructorDecl::init_const_range &Inits,
const R &OrderedDecls,
const SmallPtrSetImpl<const T *> &DeclsToInit) {
@@ -239,8 +248,9 @@ computeInsertions(const CXXConstructorDecl::init_const_range &Inits,
// Gets the list of bases and members that could possibly be initialized, in
// order as they appear in the class declaration.
-void getInitializationsInOrder(const CXXRecordDecl &ClassDecl,
- SmallVectorImpl<const NamedDecl *> &Decls) {
+static void
+getInitializationsInOrder(const CXXRecordDecl &ClassDecl,
+ SmallVectorImpl<const NamedDecl *> &Decls) {
Decls.clear();
for (const auto &Base : ClassDecl.bases()) {
// Decl may be null if the base class is a template parameter.
@@ -253,9 +263,10 @@ void getInitializationsInOrder(const CXXRecordDecl &ClassDecl,
}
template <typename T>
-void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag,
- const CXXConstructorDecl *Ctor,
- const SmallPtrSetImpl<const T *> &DeclsToInit) {
+static void fixInitializerList(const ASTContext &Context,
+ DiagnosticBuilder &Diag,
+ const CXXConstructorDecl *Ctor,
+ const SmallPtrSetImpl<const T *> &DeclsToInit) {
// Do not propose fixes in macros since we cannot place them correctly.
if (Ctor->getBeginLoc().isMacroID())
return;
@@ -271,8 +282,6 @@ void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag,
}
}
-} // anonymous namespace
-
ProTypeMemberInitCheck::ProTypeMemberInitCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),
diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
index 0d81b9a..bd51cc5 100644
--- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp
@@ -111,10 +111,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder,
}
RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() {
- Stencil message = cat("use 'OpType::create(builder, ...)' instead of "
+ Stencil Message = cat("use 'OpType::create(builder, ...)' instead of "
"'builder.create<OpType>(...)'");
// Match a create call on an OpBuilder.
- ast_matchers::internal::Matcher<Stmt> base =
+ ast_matchers::internal::Matcher<Stmt> Base =
cxxMemberCallExpr(
on(expr(hasType(
cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder"))))
@@ -124,10 +124,10 @@ RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() {
.bind("call");
return applyFirst(
// Attempt rewrite given an lvalue builder, else just warn.
- {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), base),
+ {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base),
rewrite(node("call"), node("builder"), callArgs("call")),
- message),
- makeRule(base, noopEdit(node("call")), message)});
+ Message),
+ makeRule(Base, noopEdit(node("call")), Message)});
}
} // namespace
diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
index 8ec7695..3b9b8e0 100644
--- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
@@ -60,12 +60,12 @@ AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher<Expr>,
return InnerMatcher.matches(*E, Finder, Builder);
return false;
}
+} // namespace
-auto typeWithNameIn(const std::vector<StringRef> &Names) {
+static auto typeWithNameIn(const std::vector<StringRef> &Names) {
return hasType(
hasCanonicalType(hasDeclaration(namedDecl(hasAnyName(Names)))));
}
-} // namespace
CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name,
ClangTidyContext *Context)
diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp
index 5e0f32a..9801c9e 100644
--- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp
@@ -53,7 +53,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) {
const auto *FPT = Node.getType()->castAs<FunctionProtoType>();
ASTContext &Ctx = Node.getASTContext();
if (Ctx.getLangOpts().SizedDeallocation &&
- Ctx.hasSameType(FPT->getParamType(1), Ctx.getSizeType()))
+ ASTContext::hasSameType(FPT->getParamType(1), Ctx.getSizeType()))
return false;
return true;
diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp
index 0d7667c..035598d 100644
--- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp
@@ -151,10 +151,12 @@ constexpr unsigned SmallSCCSize = 32;
using CallStackTy =
llvm::SmallVector<CallGraphNode::CallRecord, SmallCallStackSize>;
+} // namespace
+
// In given SCC, find *some* call stack that will be cyclic.
// This will only find *one* such stack, it might not be the smallest one,
// and there may be other loops.
-CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) {
+static CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) {
// We'll need to be able to performantly look up whether some CallGraphNode
// is in SCC or not, so cache all the SCC elements in a set.
const ImmutableSmallSet<CallGraphNode *, SmallSCCSize> SCCElts(SCC);
@@ -190,8 +192,6 @@ CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) {
return CallStack;
}
-} // namespace
-
void NoRecursionCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(translationUnitDecl().bind("TUDecl"), this);
}
diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
index 17a8a50..6baa12a 100644
--- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
@@ -29,7 +29,6 @@ using namespace clang::ast_matchers;
using namespace clang::tidy::matchers;
namespace clang::tidy::misc {
-namespace {
using llvm::APSInt;
static constexpr llvm::StringLiteral KnownBannedMacroNames[] = {
@@ -420,6 +419,8 @@ markDuplicateOperands(const TExpr *TheExpr,
return Duplicates.any();
}
+namespace {
+
AST_MATCHER(Expr, isIntegerConstantExpr) {
if (Node.isInstantiationDependent())
return false;
@@ -470,6 +471,8 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef<llvm::StringLiteral>, Names) {
return false;
}
+} // namespace
+
// Returns a matcher for integer constant expressions.
static ast_matchers::internal::Matcher<Expr>
matchIntegerConstantExpr(StringRef Id) {
@@ -805,7 +808,8 @@ static bool isSameRawIdentifierToken(const Token &T1, const Token &T2,
StringRef(SM.getCharacterData(T2.getLocation()), T2.getLength());
}
-bool isTokAtEndOfExpr(SourceRange ExprSR, Token T, const SourceManager &SM) {
+static bool isTokAtEndOfExpr(SourceRange ExprSR, Token T,
+ const SourceManager &SM) {
return SM.getExpansionLoc(ExprSR.getEnd()) == T.getLocation();
}
@@ -921,7 +925,6 @@ static bool areExprsSameMacroOrLiteral(const BinaryOperator *BinOp,
return false;
}
-} // namespace
void RedundantExpressionCheck::registerMatchers(MatchFinder *Finder) {
const auto BannedIntegerLiteral =
diff --git a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp
index 27ddb7c..ab2077b 100644
--- a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp
@@ -53,9 +53,8 @@ void UniqueptrResetReleaseCheck::registerMatchers(MatchFinder *Finder) {
this);
}
-namespace {
-const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result,
- StringRef ID) {
+static const Type *
+getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, StringRef ID) {
const auto *Class =
Result.Nodes.getNodeAs<ClassTemplateSpecializationDecl>(ID);
if (!Class)
@@ -66,7 +65,7 @@ const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result,
return DeleterArgument.getAsType().getTypePtr();
}
-bool areDeletersCompatible(const MatchFinder::MatchResult &Result) {
+static bool areDeletersCompatible(const MatchFinder::MatchResult &Result) {
const Type *LeftDeleterType = getDeleterForUniquePtr(Result, "left_class");
const Type *RightDeleterType = getDeleterForUniquePtr(Result, "right_class");
@@ -103,8 +102,6 @@ bool areDeletersCompatible(const MatchFinder::MatchResult &Result) {
return false;
}
-} // namespace
-
void UniqueptrResetReleaseCheck::check(const MatchFinder::MatchResult &Result) {
if (!areDeletersCompatible(Result))
return;
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
index 37482583..fea5ac6 100644
--- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
@@ -499,7 +499,7 @@ static bool canBeModified(ASTContext *Context, const Expr *E) {
return true;
if (const auto *Cast = Parents[0].get<ImplicitCastExpr>()) {
if ((Cast->getCastKind() == CK_NoOp &&
- Context->hasSameType(Cast->getType(), E->getType().withConst())) ||
+ ASTContext::hasSameType(Cast->getType(), E->getType().withConst())) ||
(Cast->getCastKind() == CK_LValueToRValue &&
!Cast->getType().isNull() && Cast->getType()->isFundamentalType()))
return false;
@@ -664,7 +664,8 @@ void LoopConvertCheck::doConversion(
AliasVarIsRef = true;
}
if (Descriptor.ElemType.isNull() ||
- !Context->hasSameUnqualifiedType(AliasVarType, Descriptor.ElemType))
+ !ASTContext::hasSameUnqualifiedType(AliasVarType,
+ Descriptor.ElemType))
Descriptor.ElemType = AliasVarType;
}
@@ -944,7 +945,7 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context,
CanonicalInitVarType->isPointerType()) {
// If the initializer and the variable are both pointers check if the
// un-qualified pointee types match, otherwise we don't use auto.
- return Context->hasSameUnqualifiedType(
+ return ASTContext::hasSameUnqualifiedType(
CanonicalBeginType->getPointeeType(),
CanonicalInitVarType->getPointeeType());
}
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
index 286c39b..586deea 100644
--- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp
@@ -370,7 +370,7 @@ static bool isAliasDecl(ASTContext *Context, const Decl *TheDecl,
DeclarationType = DeclarationType.getNonReferenceType();
if (InitType.isNull() || DeclarationType.isNull() ||
- !Context->hasSameUnqualifiedType(DeclarationType, InitType))
+ !ASTContext::hasSameUnqualifiedType(DeclarationType, InitType))
return false;
}
diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
index c7fd0a9..01796a6 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp
@@ -316,7 +316,7 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) {
if (NestedConstruct->getConstructor()->isConvertingConstructor(false))
return;
}
- if (!Context->hasSameType(V->getType(), E->getType()))
+ if (!ASTContext::hasSameType(V->getType(), E->getType()))
return;
}
@@ -378,7 +378,7 @@ void UseAutoCheck::replaceExpr(
return;
// If VarDecl and Initializer have mismatching unqualified types.
- if (!Context->hasSameUnqualifiedType(V->getType(), GetType(Expr)))
+ if (!ASTContext::hasSameUnqualifiedType(V->getType(), GetType(Expr)))
return;
// All subsequent variables in this declaration should have the same
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
index b921819..b6834c6 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
@@ -21,8 +21,6 @@ using namespace llvm;
namespace clang::tidy::modernize {
namespace {
-const char CastSequence[] = "sequence";
-
AST_MATCHER(Type, sugaredNullptrType) {
const Type *DesugaredType = Node.getUnqualifiedDesugaredType();
if (const auto *BT = dyn_cast<BuiltinType>(DesugaredType))
@@ -30,6 +28,10 @@ AST_MATCHER(Type, sugaredNullptrType) {
return false;
}
+} // namespace
+
+static const char CastSequence[] = "sequence";
+
/// Create a matcher that finds implicit casts as well as the head of a
/// sequence of zero or more nested explicit casts that have an implicit cast
/// to null within.
@@ -43,7 +45,8 @@ AST_MATCHER(Type, sugaredNullptrType) {
/// would check for the "NULL" macro instead, but that'd be harder to express.
/// In practice, "NULL" is often defined as "__null", and this is a useful
/// condition.
-StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) {
+static StatementMatcher
+makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) {
auto ImplicitCastToNull = implicitCastExpr(
anyOf(hasCastKind(CK_NullToPointer), hasCastKind(CK_NullToMemberPointer)),
anyOf(hasSourceExpression(gnuNullExpr()),
@@ -79,16 +82,16 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) {
unless(hasAncestor(functionDecl(isDefaulted()))))));
}
-bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc,
- const SourceManager &SM) {
+static bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc,
+ const SourceManager &SM) {
return SM.isWrittenInSameFile(StartLoc, EndLoc);
}
/// Replaces the provided range with the text "nullptr", but only if
/// the start and end location are both in main file.
/// Returns true if and only if a replacement was made.
-void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM,
- SourceLocation StartLoc, SourceLocation EndLoc) {
+static void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM,
+ SourceLocation StartLoc, SourceLocation EndLoc) {
CharSourceRange Range(SourceRange(StartLoc, EndLoc), true);
// Add a space if nullptr follows an alphanumeric character. This happens
// whenever there is an c-style explicit cast to nullptr not surrounded by
@@ -106,8 +109,9 @@ void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM,
/// #define MY_NULL NULL
/// \endcode
/// If \p Loc points to NULL, this function will return the name MY_NULL.
-StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM,
- const LangOptions &LO) {
+static StringRef getOutermostMacroName(SourceLocation Loc,
+ const SourceManager &SM,
+ const LangOptions &LO) {
assert(Loc.isMacroID());
SourceLocation OutermostMacroLoc;
@@ -119,6 +123,8 @@ StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM,
return Lexer::getImmediateMacroName(OutermostMacroLoc, SM, LO);
}
+namespace {
+
/// RecursiveASTVisitor for ensuring all nodes rooted at a given AST
/// subtree that have file-level source locations corresponding to a macro
/// argument have implicit NullTo(Member)Pointer nodes as ancestors.
diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp
index d26480f..7c90130 100644
--- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp
@@ -17,9 +17,8 @@ using namespace clang::ast_matchers;
namespace clang::tidy::performance {
-namespace {
-
-std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) {
+static std::optional<std::string>
+makeCharacterLiteral(const StringLiteral *Literal) {
std::string Result;
{
llvm::raw_string_ostream OS(Result);
@@ -43,6 +42,8 @@ std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) {
return Result;
}
+namespace {
+
AST_MATCHER_FUNCTION(ast_matchers::internal::Matcher<Expr>,
hasSubstitutedType) {
return hasType(qualType(anyOf(substTemplateTypeParmType(),
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
index 3da1469..4a8f292 100644
--- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp
@@ -17,8 +17,6 @@ using namespace clang::ast_matchers;
namespace clang::tidy::performance {
-namespace {
-
// Matcher names. Given the code:
//
// \code
@@ -60,12 +58,14 @@ static const char LoopInitVarName[] = "loop_init_var";
static const char LoopEndExprName[] = "loop_end_expr";
static const char RangeLoopName[] = "for_range_loop";
-ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() {
+static ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() {
return hasType(cxxRecordDecl(hasAnyName(
"::std::vector", "::std::set", "::std::unordered_set", "::std::map",
"::std::unordered_map", "::std::array", "::std::deque")));
}
+namespace {
+
AST_MATCHER(Expr, hasSideEffects) {
return Node.HasSideEffects(Finder->getASTContext());
}
diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
index 570a109..0237c05 100644
--- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp
@@ -64,6 +64,8 @@ private:
const SourceManager &SM;
};
+} // namespace
+
void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc,
FileChangeReason Reason,
SrcMgr::CharacteristicKind FileType,
@@ -107,8 +109,6 @@ void DuplicateIncludeCallbacks::MacroUndefined(const Token &MacroNameTok,
Files.back().clear();
}
-} // namespace
-
void DuplicateIncludeCheck::registerPPCallbacks(
const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
PP->addPPCallbacks(std::make_unique<DuplicateIncludeCallbacks>(*this, SM));
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index bfdf9cb..6f6da57 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -51,7 +51,7 @@ static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
return Type->isUnsignedIntegerType() ? "0u" : "0";
case CK_FloatingToBoolean:
- return Context.hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0";
+ return ASTContext::hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0";
case CK_PointerToBoolean:
case CK_MemberPointerToBoolean: // Fall-through on purpose.
@@ -215,7 +215,7 @@ getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
}
if (DestType->isFloatingType()) {
- if (Context.hasSameType(DestType, Context.FloatTy)) {
+ if (ASTContext::hasSameType(DestType, Context.FloatTy)) {
return BoolLiteral->getValue() ? "1.0f" : "0.0f";
}
return BoolLiteral->getValue() ? "1.0" : "0.0";
diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
index 2eb26fc..93580a7 100644
--- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp
@@ -54,9 +54,12 @@ struct InconsistentDeclarationInfo {
using InconsistentDeclarationsContainer =
llvm::SmallVector<InconsistentDeclarationInfo, 2>;
-bool checkIfFixItHintIsApplicable(
- const FunctionDecl *ParameterSourceDeclaration,
- const ParmVarDecl *SourceParam, const FunctionDecl *OriginalDeclaration) {
+} // namespace
+
+static bool
+checkIfFixItHintIsApplicable(const FunctionDecl *ParameterSourceDeclaration,
+ const ParmVarDecl *SourceParam,
+ const FunctionDecl *OriginalDeclaration) {
// Assumptions with regard to function declarations/definition:
// * If both function declaration and definition are seen, assume that
// definition is most up-to-date, and use it to generate replacements.
@@ -83,7 +86,7 @@ bool checkIfFixItHintIsApplicable(
return true;
}
-bool nameMatch(StringRef L, StringRef R, bool Strict) {
+static bool nameMatch(StringRef L, StringRef R, bool Strict) {
if (Strict)
return L.empty() || R.empty() || L == R;
// We allow two names if one is a prefix/suffix of the other, ignoring case.
@@ -92,7 +95,7 @@ bool nameMatch(StringRef L, StringRef R, bool Strict) {
L.ends_with_insensitive(R) || R.ends_with_insensitive(L);
}
-DifferingParamsContainer
+static DifferingParamsContainer
findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration,
const FunctionDecl *OtherDeclaration,
const FunctionDecl *OriginalDeclaration,
@@ -129,7 +132,7 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration,
return DifferingParams;
}
-InconsistentDeclarationsContainer
+static InconsistentDeclarationsContainer
findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration,
const FunctionDecl *ParameterSourceDeclaration,
SourceManager &SM, bool Strict) {
@@ -162,7 +165,7 @@ findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration,
return InconsistentDeclarations;
}
-const FunctionDecl *
+static const FunctionDecl *
getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) {
const FunctionTemplateDecl *PrimaryTemplate =
OriginalDeclaration->getPrimaryTemplate();
@@ -187,7 +190,7 @@ getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) {
return OriginalDeclaration;
}
-std::string joinParameterNames(
+static std::string joinParameterNames(
const DifferingParamsContainer &DifferingParams,
llvm::function_ref<StringRef(const DifferingParamInfo &)> ChooseParamName) {
llvm::SmallString<40> Str;
@@ -202,7 +205,7 @@ std::string joinParameterNames(
return std::string(Str);
}
-void formatDifferingParamsDiagnostic(
+static void formatDifferingParamsDiagnostic(
InconsistentDeclarationParameterNameCheck *Check, SourceLocation Location,
StringRef OtherDeclarationDescription,
const DifferingParamsContainer &DifferingParams) {
@@ -230,7 +233,7 @@ void formatDifferingParamsDiagnostic(
}
}
-void formatDiagnosticsForDeclarations(
+static void formatDiagnosticsForDeclarations(
InconsistentDeclarationParameterNameCheck *Check,
const FunctionDecl *ParameterSourceDeclaration,
const FunctionDecl *OriginalDeclaration,
@@ -256,7 +259,7 @@ void formatDiagnosticsForDeclarations(
}
}
-void formatDiagnostics(
+static void formatDiagnostics(
InconsistentDeclarationParameterNameCheck *Check,
const FunctionDecl *ParameterSourceDeclaration,
const FunctionDecl *OriginalDeclaration,
@@ -279,8 +282,6 @@ void formatDiagnostics(
}
}
-} // anonymous namespace
-
void InconsistentDeclarationParameterNameCheck::storeOptions(
ClangTidyOptions::OptionMap &Opts) {
Options.store(Opts, "IgnoreMacros", IgnoreMacros);
diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
index dc9510d..942a0a8 100644
--- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp
@@ -142,12 +142,11 @@ void QualifiedAutoCheck::registerMatchers(MatchFinder *Finder) {
if (this->IgnoreAliasing) {
return qualType(
hasUnqualifiedDesugaredType(pointerType(pointee(InnerMatchers...))));
- } else {
- return qualType(
- anyOf(qualType(pointerType(pointee(InnerMatchers...))),
- qualType(substTemplateTypeParmType(hasReplacementType(
- pointerType(pointee(InnerMatchers...)))))));
}
+ return qualType(anyOf(qualType(pointerType(pointee(InnerMatchers...))),
+ qualType(substTemplateTypeParmType(hasReplacementType(
+ pointerType(pointee(InnerMatchers...)))))));
+
};
auto IsAutoDeducedToPointer =
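
The QualifiedAutoCheck hunk applies the else-after-return style: when the if branch returns, the trailing else only adds nesting, so the alternative is flattened to fall through. A minimal sketch with a hypothetical function:

static int pick(bool Flag) {
  if (Flag)
    return 1;
  return 2; // was: } else { return 2; }
}
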
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp
index 0598683..107291d 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp
@@ -14,8 +14,8 @@ using namespace clang::ast_matchers;
namespace clang::tidy::readability {
-namespace {
-internal::Matcher<Expr> callToGet(const internal::Matcher<Decl> &OnClass) {
+static internal::Matcher<Expr>
+callToGet(const internal::Matcher<Decl> &OnClass) {
return expr(
anyOf(cxxMemberCallExpr(
on(expr(anyOf(hasType(OnClass),
@@ -43,12 +43,13 @@ internal::Matcher<Expr> callToGet(const internal::Matcher<Decl> &OnClass) {
.bind("redundant_get");
}
-internal::Matcher<Decl> knownSmartptr() {
+static internal::Matcher<Decl> knownSmartptr() {
return recordDecl(hasAnyName("::std::unique_ptr", "::std::shared_ptr"));
}
-void registerMatchersForGetArrowStart(MatchFinder *Finder,
- MatchFinder::MatchCallback *Callback) {
+static void
+registerMatchersForGetArrowStart(MatchFinder *Finder,
+ MatchFinder::MatchCallback *Callback) {
const auto MatchesOpArrow =
allOf(hasName("operator->"),
returns(qualType(pointsTo(type().bind("op->Type")))));
@@ -100,8 +101,8 @@ void registerMatchersForGetArrowStart(MatchFinder *Finder,
Callback);
}
-void registerMatchersForGetEquals(MatchFinder *Finder,
- MatchFinder::MatchCallback *Callback) {
+static void registerMatchersForGetEquals(MatchFinder *Finder,
+ MatchFinder::MatchCallback *Callback) {
// This one is harder to do with duck typing.
// The operator==/!= that we are looking for might be member or non-member,
// might be on global namespace or found by ADL, might be a template, etc.
@@ -118,8 +119,6 @@ void registerMatchersForGetEquals(MatchFinder *Finder,
// FIXME: Match and fix if (l.get() == r.get()).
}
-} // namespace
-
void RedundantSmartptrGetCheck::storeOptions(
ClangTidyOptions::OptionMap &Opts) {
Options.store(Opts, "IgnoreMacros", IgnoreMacros);
diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
index d1738f1..feb248d 100644
--- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp
@@ -288,8 +288,8 @@ static bool applyDiceHeuristic(StringRef Arg, StringRef Param,
std::size_t Intersection = 0;
// Find the intersection between the two sets.
- for (auto IT = ParamBigrams.begin(); IT != ParamBigrams.end(); ++IT)
- Intersection += ArgBigrams.count((IT->getKey()));
+ for (const auto &[Key, _] : ParamBigrams)
+ Intersection += ArgBigrams.count(Key);
// Calculate Dice coefficient.
return percentage(Intersection * 2.0,
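
The SuspiciousCallArgumentCheck hunk swaps an explicit iterator loop for a range-based for with a structured binding, leaving the Dice coefficient computation (2 * |intersection| / (|A| + |B|)) unchanged. A self-contained sketch, with std::map standing in for the bigram containers the check actually uses:

#include <cstddef>
#include <map>
#include <string>

static double diceCoefficient(const std::map<std::string, int> &ArgBigrams,
                              const std::map<std::string, int> &ParamBigrams) {
  std::size_t Intersection = 0;
  // The structured binding reads the key directly; the old code went
  // through an explicit iterator and IT->getKey().
  for (const auto &[Key, Count] : ParamBigrams) {
    (void)Count; // only the key matters for the intersection
    Intersection += ArgBigrams.count(Key);
  }
  return (2.0 * Intersection) /
         static_cast<double>(ArgBigrams.size() + ParamBigrams.size());
}
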
diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp
index c1dc209..740a68d 100644
--- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp
@@ -55,8 +55,10 @@ struct NewSuffix {
std::optional<FixItHint> FixIt;
};
-std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc,
- const SourceManager &SM) {
+} // namespace
+
+static std::optional<SourceLocation>
+getMacroAwareLocation(SourceLocation Loc, const SourceManager &SM) {
// Do nothing if the provided location is invalid.
if (Loc.isInvalid())
return std::nullopt;
@@ -67,8 +69,8 @@ std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc,
return SpellingLoc;
}
-std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc,
- const SourceManager &SM) {
+static std::optional<SourceRange>
+getMacroAwareSourceRange(SourceRange Loc, const SourceManager &SM) {
std::optional<SourceLocation> Begin =
getMacroAwareLocation(Loc.getBegin(), SM);
std::optional<SourceLocation> End = getMacroAwareLocation(Loc.getEnd(), SM);
@@ -77,7 +79,7 @@ std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc,
return SourceRange(*Begin, *End);
}
-std::optional<std::string>
+static std::optional<std::string>
getNewSuffix(llvm::StringRef OldSuffix,
const std::vector<StringRef> &NewSuffixes) {
// If there is no config, just uppercase the entirety of the suffix.
@@ -96,7 +98,7 @@ getNewSuffix(llvm::StringRef OldSuffix,
}
template <typename LiteralType>
-std::optional<NewSuffix>
+static std::optional<NewSuffix>
shouldReplaceLiteralSuffix(const Expr &Literal,
const std::vector<StringRef> &NewSuffixes,
const SourceManager &SM, const LangOptions &LO) {
@@ -174,8 +176,6 @@ shouldReplaceLiteralSuffix(const Expr &Literal,
return ReplacementDsc;
}
-} // namespace
-
UppercaseLiteralSuffixCheck::UppercaseLiteralSuffixCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),
diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
index 57453ad..a5b0883 100644
--- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp
@@ -19,9 +19,8 @@ namespace clang::tidy::utils::decl_ref_expr {
using namespace ::clang::ast_matchers;
using llvm::SmallPtrSet;
-namespace {
-
-template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) {
+template <typename S>
+static bool isSetDifferenceEmpty(const S &S1, const S &S2) {
for (auto E : S1)
if (S2.count(E) == 0)
return false;
@@ -30,15 +29,15 @@ template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) {
// Extracts all Nodes keyed by ID from Matches and inserts them into Nodes.
template <typename Node>
-void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID,
- SmallPtrSet<const Node *, 16> &Nodes) {
+static void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID,
+ SmallPtrSet<const Node *, 16> &Nodes) {
for (const auto &Match : Matches)
Nodes.insert(Match.getNodeAs<Node>(ID));
}
// Returns true if both types refer to the same type,
// ignoring the const-qualifier.
-bool isSameTypeIgnoringConst(QualType A, QualType B) {
+static bool isSameTypeIgnoringConst(QualType A, QualType B) {
A = A.getCanonicalType();
B = B.getCanonicalType();
A.addConst();
@@ -47,7 +46,8 @@ bool isSameTypeIgnoringConst(QualType A, QualType B) {
}
// Returns true if `D` and `O` have the same parameter types.
-bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) {
+static bool hasSameParameterTypes(const CXXMethodDecl &D,
+ const CXXMethodDecl &O) {
if (D.getNumParams() != O.getNumParams())
return false;
for (int I = 0, E = D.getNumParams(); I < E; ++I) {
@@ -60,7 +60,7 @@ bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) {
// If `D` has a const-qualified overload with otherwise identical
// ref-qualifiers and parameter types, returns that overload.
-const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) {
+static const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) {
assert(!D.isConst());
DeclContext::lookup_result LookupResult =
@@ -81,7 +81,7 @@ const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) {
// Returns true if both types are pointers or references to the same type,
// ignoring the const-qualifier.
-bool pointsToSameTypeIgnoringConst(QualType A, QualType B) {
+static bool pointsToSameTypeIgnoringConst(QualType A, QualType B) {
assert(A->isPointerType() || A->isReferenceType());
assert(B->isPointerType() || B->isReferenceType());
return isSameTypeIgnoringConst(A->getPointeeType(), B->getPointeeType());
@@ -122,7 +122,7 @@ bool pointsToSameTypeIgnoringConst(QualType A, QualType B) {
//
// This function checks (A) and (B), but the caller should make sure that the
// object is not mutated through the return value.
-bool isLikelyShallowConst(const CXXMethodDecl &M) {
+static bool isLikelyShallowConst(const CXXMethodDecl &M) {
assert(!M.isConst());
// The method can mutate our variable.
@@ -146,6 +146,8 @@ bool isLikelyShallowConst(const CXXMethodDecl &M) {
return isSameTypeIgnoringConst(CallTy, OverloadTy);
}
+namespace {
+
// A matcher that matches DeclRefExprs that are used in ways such that the
// underlying declaration is not modified.
// If the declaration is of pointer type, `Indirections` specifies the level
diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp
index 044f89b..b068ae2 100644
--- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp
+++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp
@@ -19,8 +19,6 @@
namespace clang::tidy::utils {
-namespace {
-
/// Returns true if Name is reserved, like _Foo or __Vector_base.
static inline bool isReservedName(llvm::StringRef Name) {
// This doesn't catch all cases, but the most common.
@@ -28,6 +26,8 @@ static inline bool isReservedName(llvm::StringRef Name) {
(isUppercase(Name[1]) || Name[1] == '_');
}
+namespace {
+
// Helper class to iterate over the designator names of an aggregate type.
//
// For an array type, yields [0], [1], [2]...
@@ -112,6 +112,8 @@ private:
RecordDecl::field_iterator FieldsEnd;
};
+} // namespace
+
// Collect designator labels describing the elements of an init list.
//
// This function contributes the designators of some (sub)object, which is
@@ -127,10 +129,9 @@ private:
// '.a:' is produced directly without recursing into the written sublist.
// (The written sublist will have a separate collectDesignators() call later).
// Recursion with Prefix='.b' and Sem = {3, ImplicitValue} produces '.b.x:'.
-void collectDesignators(const InitListExpr *Sem,
- llvm::DenseMap<SourceLocation, std::string> &Out,
- const llvm::DenseSet<SourceLocation> &NestedBraces,
- std::string &Prefix) {
+static void collectDesignators(
+ const InitListExpr *Sem, llvm::DenseMap<SourceLocation, std::string> &Out,
+ const llvm::DenseSet<SourceLocation> &NestedBraces, std::string &Prefix) {
if (!Sem || Sem->isTransparent())
return;
assert(Sem->isSemanticForm());
@@ -170,8 +171,6 @@ void collectDesignators(const InitListExpr *Sem,
}
}
-} // namespace
-
llvm::DenseMap<SourceLocation, std::string>
getUnwrittenDesignators(const InitListExpr *Syn) {
assert(Syn->isSyntacticForm());
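
A minimal sketch of what the collection above produces, using hypothetical aggregate types (not from this patch):

    // Hypothetical input illustrating collectDesignators().
    struct Inner { int x, y; };
    struct Outer { int a; Inner b; };
    Outer o = {3, 4}; // no braces written around the Inner sub-object
    // '.a:' is emitted for '3'; recursion with Prefix='.b' into the
    // implicit sublist then emits '.b.x:' for '4', as in the comment above.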
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index b746df5..570cab2 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -245,7 +245,7 @@ the configuration (without a prefix: ``Auto``).
.. note::
This currently only applies to braced initializer lists (when
- ``Cpp11BracedListStyle`` is ``true``) and parentheses.
+ ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
@@ -3816,29 +3816,72 @@ the configuration (without a prefix: ``Auto``).
.. _Cpp11BracedListStyle:
-**Cpp11BracedListStyle** (``Boolean``) :versionbadge:`clang-format 3.4` :ref:`¶ <Cpp11BracedListStyle>`
- If ``true``, format braced lists as best suited for C++11 braced
- lists.
+**Cpp11BracedListStyle** (``BracedListStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ <Cpp11BracedListStyle>`
+ The style for handling braced lists.
- Important differences:
+ Possible values:
- * No spaces inside the braced list.
- * No line break before the closing brace.
- * Indentation with the continuation indent, not with the block indent.
+ * ``BLS_Block`` (in configuration: ``Block``)
+ Best suited for pre-C++11 braced lists.
- Fundamentally, C++11 braced lists are formatted exactly like function
- calls would be formatted in their place. If the braced list follows a name
- (e.g. a type or variable name), clang-format formats as if the ``{}`` were
- the parentheses of a function call with that name. If there is no name,
- a zero-length name is assumed.
+ * Spaces inside the braced list.
+ * Line break before the closing brace.
+ * Indentation with the block indent.
+
+
+ .. code-block:: c++
+
+ vector<int> x{ 1, 2, 3, 4 };
+ vector<T> x{ {}, {}, {}, {} };
+ f(MyMap[{ composite, key }]);
+ new int[3]{ 1, 2, 3 };
+ Type name{ // Comment
+ value
+ };
+
+ * ``BLS_FunctionCall`` (in configuration: ``FunctionCall``)
+ Best suited for C++11 braced lists.
+
+ * No spaces inside the braced list.
+ * No line break before the closing brace.
+ * Indentation with the continuation indent.
+
+ Fundamentally, C++11 braced lists are formatted exactly like function
+ calls would be formatted in their place. If the braced list follows a
+ name (e.g. a type or variable name), clang-format formats as if the
+ ``{}`` were the parentheses of a function call with that name. If there
+ is no name, a zero-length name is assumed.
+
+ .. code-block:: c++
+
+ vector<int> x{1, 2, 3, 4};
+ vector<T> x{{}, {}, {}, {}};
+ f(MyMap[{composite, key}]);
+ new int[3]{1, 2, 3};
+ Type name{ // Comment
+ value};
+
+ * ``BLS_AlignFirstComment`` (in configuration: ``AlignFirstComment``)
+ Same as ``FunctionCall``, except for the handling of a comment at the
+ beginning: everything that follows is aligned with that comment.
+
+ * No spaces inside the braced list. (Even for a comment at the first
+ position.)
+ * No line break before the closing brace.
+ * Indentation with the continuation indent, except when followed by a
+ line comment, in which case it uses the block indent.
+
+
+ .. code-block:: c++
+
+ vector<int> x{1, 2, 3, 4};
+ vector<T> x{{}, {}, {}, {}};
+ f(MyMap[{composite, key}]);
+ new int[3]{1, 2, 3};
+ Type name{// Comment
+ value};
- .. code-block:: c++
- true: false:
- vector<int> x{1, 2, 3, 4}; vs. vector<int> x{ 1, 2, 3, 4 };
- vector<T> x{{}, {}, {}, {}}; vector<T> x{ {}, {}, {}, {} };
- f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]);
- new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 };
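
For configurations that still use the old boolean values, the YAML parser keeps accepting them (see the ScalarEnumerationTraits added in Format.cpp below): ``false`` maps to ``Block`` and ``true`` maps to ``AlignFirstComment``. A short sketch of programmatic use, mirroring the updated unit tests:

    // Sketch mirroring the updated unit tests; getLLVMStyle() now
    // defaults Cpp11BracedListStyle to BLS_AlignFirstComment.
    clang::format::FormatStyle Style = clang::format::getLLVMStyle();
    Style.Cpp11BracedListStyle = clang::format::FormatStyle::BLS_FunctionCall;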
.. _DeriveLineEnding:
@@ -6625,7 +6668,7 @@ the configuration (without a prefix: ``Auto``).
.. note::
This option doesn't apply to initializer braces if
- ``Cpp11BracedListStyle`` is set to ``true``.
+ ``Cpp11BracedListStyle`` is not ``Block``.
Possible values:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 62c70fba..d03c778 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -124,13 +124,13 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
}
let Features = "ssse3" in {
- def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}
@@ -608,7 +608,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
- def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -661,6 +660,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
+ def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -1386,13 +1386,10 @@ let Features = "avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth
def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
}
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
-}
-
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pavgb512 : X86Builtin<"_Vector<64, unsigned char>(_Vector<64, unsigned char>, _Vector<64, unsigned char>)">;
def pavgw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
+ def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pmulhuw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
}
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 3df5b92..2852c4a 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -94,7 +94,7 @@ struct FormatStyle {
///
/// \note
/// This currently only applies to braced initializer lists (when
- /// ``Cpp11BracedListStyle`` is ``true``) and parentheses.
+ /// ``Cpp11BracedListStyle`` is not ``Block``) and parentheses.
/// \endnote
BAS_BlockIndent,
};
@@ -2555,29 +2555,67 @@ struct FormatStyle {
/// \version 3.7
unsigned ContinuationIndentWidth;
- /// If ``true``, format braced lists as best suited for C++11 braced
- /// lists.
- ///
- /// Important differences:
- ///
- /// * No spaces inside the braced list.
- /// * No line break before the closing brace.
- /// * Indentation with the continuation indent, not with the block indent.
- ///
- /// Fundamentally, C++11 braced lists are formatted exactly like function
- /// calls would be formatted in their place. If the braced list follows a name
- /// (e.g. a type or variable name), clang-format formats as if the ``{}`` were
- /// the parentheses of a function call with that name. If there is no name,
- /// a zero-length name is assumed.
- /// \code
- /// true: false:
- /// vector<int> x{1, 2, 3, 4}; vs. vector<int> x{ 1, 2, 3, 4 };
- /// vector<T> x{{}, {}, {}, {}}; vector<T> x{ {}, {}, {}, {} };
- /// f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]);
- /// new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 };
- /// \endcode
+ /// Different ways to handle braced lists.
+ enum BracedListStyle : int8_t {
+ /// Best suited for pre-C++11 braced lists.
+ ///
+ /// * Spaces inside the braced list.
+ /// * Line break before the closing brace.
+ /// * Indentation with the block indent.
+ ///
+ /// \code
+ /// vector<int> x{ 1, 2, 3, 4 };
+ /// vector<T> x{ {}, {}, {}, {} };
+ /// f(MyMap[{ composite, key }]);
+ /// new int[3]{ 1, 2, 3 };
+ /// Type name{ // Comment
+ /// value
+ /// };
+ /// \endcode
+ BLS_Block,
+ /// Best suited for C++11 braced lists.
+ ///
+ /// * No spaces inside the braced list.
+ /// * No line break before the closing brace.
+ /// * Indentation with the continuation indent.
+ ///
+ /// Fundamentally, C++11 braced lists are formatted exactly like function
+ /// calls would be formatted in their place. If the braced list follows a
+ /// name (e.g. a type or variable name), clang-format formats as if the
+ /// ``{}`` were the parentheses of a function call with that name. If there
+ /// is no name, a zero-length name is assumed.
+ /// \code
+ /// vector<int> x{1, 2, 3, 4};
+ /// vector<T> x{{}, {}, {}, {}};
+ /// f(MyMap[{composite, key}]);
+ /// new int[3]{1, 2, 3};
+ /// Type name{ // Comment
+ /// value};
+ /// \endcode
+ BLS_FunctionCall,
+ /// Same as ``FunctionCall``, except for the handling of a comment at the
+ /// beginning: everything that follows is aligned with that comment.
+ ///
+ /// * No spaces inside the braced list. (Even for a comment at the first
+ /// position.)
+ /// * No line break before the closing brace.
+ /// * Indentation with the continuation indent, except when followed by a
+ ///   line comment, in which case it uses the block indent.
+ ///
+ /// \code
+ /// vector<int> x{1, 2, 3, 4};
+ /// vector<T> x{{}, {}, {}, {}};
+ /// f(MyMap[{composite, key}]);
+ /// new int[3]{1, 2, 3};
+ /// Type name{// Comment
+ /// value};
+ /// \endcode
+ BLS_AlignFirstComment,
+ };
+
+ /// The style for handling braced lists.
/// \version 3.4
- bool Cpp11BracedListStyle;
+ BracedListStyle Cpp11BracedListStyle;
/// This option is **deprecated**. See ``DeriveLF`` and ``DeriveCRLF`` of
/// ``LineEnding``.
@@ -4933,7 +4971,7 @@ struct FormatStyle {
/// Specifies when to insert a space in empty braces.
/// \note
/// This option doesn't apply to initializer braces if
- /// ``Cpp11BracedListStyle`` is set to ``true``.
+ /// ``Cpp11BracedListStyle`` is not ``Block``.
/// \endnote
/// \version 22
SpaceInEmptyBracesStyle SpaceInEmptyBraces;
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
index c233ca1..4aee165 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h
@@ -211,6 +211,16 @@ protected:
getExtraInvalidatedValues(ValueList &Values,
RegionAndSymbolInvalidationTraits *ETraits) const {}
+ /// A state for looking up relevant Environment entries (arguments, return
+ /// value), dynamic type information and similar "stable" things.
+ /// WARNING: During the evaluation of a function call, several state
+ /// transitions happen, so this state can become partially obsolete!
+ ///
+ /// TODO: Instead of storing a complete state object in the CallEvent, only
+ /// store the relevant parts (such as argument/return SVals etc.) that aren't
+ /// allowed to become obsolete until the end of the call evaluation.
+ ProgramStateRef getState() const { return State; }
+
public:
CallEvent &operator=(const CallEvent &) = delete;
virtual ~CallEvent() = default;
@@ -231,8 +241,11 @@ public:
}
void setForeign(bool B) const { Foreign = B; }
- /// The state in which the call is being evaluated.
- const ProgramStateRef &getState() const { return State; }
+ /// NOTE: There are plans for refactoring that would eliminate this method.
+ /// Prefer to use CheckerContext::getASTContext if possible!
+ const ASTContext &getASTContext() const {
+ return getState()->getStateManager().getContext();
+ }
/// The context in which the call is being evaluated.
const LocationContext *getLocationContext() const { return LCtx; }
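
The checker updates later in this patch all follow the same mechanical pattern; a representative before/after sketch:

    // Before: reach through the CallEvent's stored program state, which can
    // be partially obsolete during call evaluation (see the WARNING above).
    const ASTContext &Ctx = Call.getState()->getStateManager().getContext();
    // After: getState() is protected, so external code uses the new accessor.
    const ASTContext &Ctx2 = Call.getASTContext();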
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5838cf8..0cb4910 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3621,6 +3621,15 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
});
+ case clang::X86::BI__builtin_ia32_pmulhrsw128:
+ case clang::X86::BI__builtin_ia32_pmulhrsw256:
+ case clang::X86::BI__builtin_ia32_pmulhrsw512:
+ return interp__builtin_elementwise_int_binop(
+ S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
+ return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1)
+ .extractBits(16, 1);
+ });
+
case clang::X86::BI__builtin_ia32_pavgb128:
case clang::X86::BI__builtin_ia32_pavgw128:
case clang::X86::BI__builtin_ia32_pavgb256:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 16141b2..e308c17 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11819,6 +11819,14 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pavgw512:
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);
+ case clang::X86::BI__builtin_ia32_pmulhrsw128:
+ case clang::X86::BI__builtin_ia32_pmulhrsw256:
+ case clang::X86::BI__builtin_ia32_pmulhrsw512:
+ return EvaluateBinOpExpr([](const APSInt &LHS, const APSInt &RHS) {
+ return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1)
+ .extractBits(16, 1);
+ });
+
case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
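
Both evaluators model one pmulhrsw lane the same way the hardware does: take the exact 32-bit product of two signed 16-bit lanes, shift right by 14, add 1 for rounding, and drop the low bit (the extractBits(16, 1) call). A scalar sketch of a single lane, not code from the patch:

    // One pmulhrsw lane: ((a * b >> 14) + 1) >> 1, truncated to 16 bits.
    short mulhrs(short a, short b) {
      int Product = (int)a * (int)b;               // exact 32-bit product
      return (short)(((Product >> 14) + 1) >> 1);  // round, then scale
    }
    // Matches the new ssse3 test below: mulhrs(100, 8000) == 24.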
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp
index 29db200..994a427 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -306,8 +306,10 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators(
// In Verilog, all strings are quoted by double quotes, joined by commas,
// and wrapped in braces. The comma is always before the newline.
assert(QuoteStyle == DoubleQuotes);
- LeftBraceQuote = Style.Cpp11BracedListStyle ? "{\"" : "{ \"";
- RightBraceQuote = Style.Cpp11BracedListStyle ? "\"}" : "\" }";
+ LeftBraceQuote =
+ Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? "{\"" : "{ \"";
+ RightBraceQuote =
+ Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? "\"}" : "\" }";
Postfix = "\",";
Prefix = "\"";
} else {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index b7d8569..26a9542 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -833,7 +833,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
auto IsOpeningBracket = [&](const FormatToken &Tok) {
auto IsStartOfBracedList = [&]() {
return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) &&
- Style.Cpp11BracedListStyle;
+ Style.Cpp11BracedListStyle != FormatStyle::BLS_Block;
};
if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) &&
!IsStartOfBracedList()) {
@@ -925,7 +925,12 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
TT_TableGenDAGArgOpenerToBreak) &&
!(Current.MacroParent && Previous.MacroParent) &&
(Current.isNot(TT_LineComment) ||
- Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) &&
+ (Previous.is(BK_BracedInit) &&
+ (Style.Cpp11BracedListStyle != FormatStyle::BLS_FunctionCall ||
+ !Previous.Previous ||
+ Previous.Previous->isNoneOf(tok::identifier, tok::l_paren,
+ BK_BracedInit))) ||
+ Previous.is(TT_VerilogMultiLineListLParen)) &&
!IsInTemplateString(Current)) {
CurrentState.Indent = State.Column + Spaces;
CurrentState.IsAligned = true;
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 093e88f..edd126c 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -304,6 +304,18 @@ struct ScalarEnumerationTraits<FormatStyle::BreakTemplateDeclarationsStyle> {
}
};
+template <> struct ScalarEnumerationTraits<FormatStyle::BracedListStyle> {
+ static void enumeration(IO &IO, FormatStyle::BracedListStyle &Value) {
+ IO.enumCase(Value, "Block", FormatStyle::BLS_Block);
+ IO.enumCase(Value, "FunctionCall", FormatStyle::BLS_FunctionCall);
+ IO.enumCase(Value, "AlignFirstComment", FormatStyle::BLS_AlignFirstComment);
+
+ // For backward compatibility.
+ IO.enumCase(Value, "false", FormatStyle::BLS_Block);
+ IO.enumCase(Value, "true", FormatStyle::BLS_AlignFirstComment);
+ }
+};
+
template <> struct ScalarEnumerationTraits<FormatStyle::DAGArgStyle> {
static void enumeration(IO &IO, FormatStyle::DAGArgStyle &Value) {
IO.enumCase(Value, "DontBreak", FormatStyle::DAS_DontBreak);
@@ -1628,7 +1640,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
LLVMStyle.CompactNamespaces = false;
LLVMStyle.ConstructorInitializerIndentWidth = 4;
LLVMStyle.ContinuationIndentWidth = 4;
- LLVMStyle.Cpp11BracedListStyle = true;
+ LLVMStyle.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment;
LLVMStyle.DerivePointerAlignment = false;
LLVMStyle.DisableFormat = false;
LLVMStyle.EmptyLineAfterAccessModifier = FormatStyle::ELAAMS_Never;
@@ -1904,7 +1916,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
// beneficial there. Investigate turning this on once proper string reflow
// has been implemented.
GoogleStyle.BreakStringLiterals = false;
- GoogleStyle.Cpp11BracedListStyle = false;
+ GoogleStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block;
GoogleStyle.SpacesInContainerLiterals = false;
} else if (Language == FormatStyle::LK_ObjC) {
GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
@@ -2000,7 +2012,7 @@ FormatStyle getMozillaStyle() {
MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
MozillaStyle.ConstructorInitializerIndentWidth = 2;
MozillaStyle.ContinuationIndentWidth = 2;
- MozillaStyle.Cpp11BracedListStyle = false;
+ MozillaStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block;
MozillaStyle.FixNamespaceComments = false;
MozillaStyle.IndentCaseLabels = true;
MozillaStyle.ObjCSpaceAfterProperty = true;
@@ -2023,7 +2035,7 @@ FormatStyle getWebKitStyle() {
Style.BreakBeforeBraces = FormatStyle::BS_WebKit;
Style.BreakConstructorInitializers = FormatStyle::BCIS_BeforeComma;
Style.ColumnLimit = 0;
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
Style.FixNamespaceComments = false;
Style.IndentWidth = 4;
Style.NamespaceIndentation = FormatStyle::NI_Inner;
@@ -2043,7 +2055,7 @@ FormatStyle getGNUStyle() {
Style.BreakBeforeBraces = FormatStyle::BS_GNU;
Style.BreakBeforeTernaryOperators = true;
Style.ColumnLimit = 79;
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
Style.FixNamespaceComments = false;
Style.KeepFormFeed = true;
Style.SpaceBeforeParens = FormatStyle::SBPO_Always;
diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp
index cf02280..d1c6264 100644
--- a/clang/lib/Format/FormatToken.cpp
+++ b/clang/lib/Format/FormatToken.cpp
@@ -67,7 +67,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const {
assert(is(tok::r_brace));
assert(MatchingParen);
assert(MatchingParen->is(tok::l_brace));
- if (!Style.Cpp11BracedListStyle ||
+ if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block ||
Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) {
return false;
}
@@ -88,7 +88,8 @@ bool FormatToken::opensBlockOrBlockTypeList(const FormatStyle &Style) const {
return is(TT_ArrayInitializerLSquare) || is(TT_ProtoExtensionLSquare) ||
(is(tok::l_brace) &&
(getBlockKind() == BK_Block || is(TT_DictLiteral) ||
- (!Style.Cpp11BracedListStyle && NestingLevel == 0))) ||
+ (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block &&
+ NestingLevel == 0))) ||
(is(tok::less) && Style.isProto());
}
@@ -184,7 +185,8 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
// In C++11 braced list style, we should not format in columns unless they
// have many items (20 or more) or we allow bin-packing of function call
// arguments.
- if (Style.Cpp11BracedListStyle && !Style.BinPackArguments &&
+ if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block &&
+ !Style.BinPackArguments &&
(Commas.size() < 19 || !Style.BinPackLongBracedList)) {
return;
}
@@ -228,7 +230,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) {
ItemEnd = Token->MatchingParen;
const FormatToken *NonCommentEnd = ItemEnd->getPreviousNonComment();
ItemLengths.push_back(CodePointsBetween(ItemBegin, NonCommentEnd));
- if (Style.Cpp11BracedListStyle &&
+ if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block &&
!ItemEnd->Previous->isTrailingComment()) {
// In Cpp11 braced list style, the } and possibly other subsequent
// tokens will need to stay on a line with the last element.
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index ffbd383..778d2ca 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -4094,7 +4094,8 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const {
if (Current->is(TT_LineComment)) {
if (Prev->is(BK_BracedInit) && Prev->opensScope()) {
Current->SpacesRequiredBefore =
- (Style.Cpp11BracedListStyle && !Style.SpacesInParensOptions.Other)
+ (Style.Cpp11BracedListStyle == FormatStyle::BLS_AlignFirstComment &&
+ !Style.SpacesInParensOptions.Other)
? 0
: 1;
} else if (Prev->is(TT_VerilogMultiLineListLParen)) {
@@ -4445,8 +4446,10 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
(Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) {
return 0;
}
- if (Left.is(tok::l_brace) && !Style.Cpp11BracedListStyle)
+ if (Left.is(tok::l_brace) &&
+ Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) {
return 19;
+ }
return Left.ParameterCount > 1 ? Style.PenaltyBreakBeforeFirstCallParameter
: 19;
}
@@ -4612,7 +4615,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
// Format empty list as `<>`.
if (Left.is(tok::less) && Right.is(tok::greater))
return false;
- return !Style.Cpp11BracedListStyle;
+ return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block;
}
// Don't attempt to format operator<(), as it is handled later.
if (Right.isNot(TT_OverloadedOperatorLParen))
@@ -4780,7 +4783,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
const auto SpaceRequiredForArrayInitializerLSquare =
[](const FormatToken &LSquareTok, const FormatStyle &Style) {
return Style.SpacesInContainerLiterals ||
- (Style.isProto() && !Style.Cpp11BracedListStyle &&
+ (Style.isProto() &&
+ Style.Cpp11BracedListStyle == FormatStyle::BLS_Block &&
LSquareTok.endsSequence(tok::l_square, tok::colon,
TT_SelectorName));
};
@@ -4813,7 +4817,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) ||
(Right.is(tok::r_brace) && Right.MatchingParen &&
Right.MatchingParen->isNot(BK_Block))) {
- return !Style.Cpp11BracedListStyle || Style.SpacesInParensOptions.Other;
+ return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block ||
+ Style.SpacesInParensOptions.Other;
}
if (Left.is(TT_BlockComment)) {
// No whitespace in x(/*foo=*/1), except for JavaScript.
@@ -4995,7 +5000,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
Left.Children.empty()) {
if (Left.is(BK_Block))
return Style.SpaceInEmptyBraces != FormatStyle::SIEB_Never;
- if (Style.Cpp11BracedListStyle) {
+ if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block) {
return Style.SpacesInParens == FormatStyle::SIPO_Custom &&
Style.SpacesInParensOptions.InEmptyParentheses;
}
@@ -5077,7 +5082,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
if (Left.MatchingParen &&
Left.MatchingParen->is(TT_ProtoExtensionLSquare) &&
Right.isOneOf(tok::l_brace, tok::less)) {
- return !Style.Cpp11BracedListStyle;
+ return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block;
}
// A percent is probably part of a formatting specification, such as %lld.
if (Left.is(tok::percent))
@@ -5517,7 +5522,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
if (Left.is(tok::greater) && Right.is(tok::greater)) {
if (Style.isTextProto() ||
(Style.Language == FormatStyle::LK_Proto && Left.is(TT_DictLiteral))) {
- return !Style.Cpp11BracedListStyle;
+ return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block;
}
return Right.is(TT_TemplateCloser) && Left.is(TT_TemplateCloser) &&
((Style.Standard < FormatStyle::LS_Cpp11) ||
@@ -6378,7 +6383,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
return false;
}
if (Left.is(tok::equal) && Right.is(tok::l_brace) &&
- !Style.Cpp11BracedListStyle) {
+ Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) {
return false;
}
if (Left.is(TT_AttributeLParen) ||
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 7348a3a..9261294 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -1238,7 +1238,8 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
if (!CellDescs.isRectangular())
return;
- const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1;
+ const int BracePadding =
+ Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1;
auto &Cells = CellDescs.Cells;
// Now go through and fixup the spaces.
auto *CellIter = Cells.begin();
@@ -1314,7 +1315,8 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
if (!CellDescs.isRectangular())
return;
- const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1;
+ const int BracePadding =
+ Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1;
auto &Cells = CellDescs.Cells;
// Now go through and fixup the spaces.
auto *CellIter = Cells.begin();
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index fa7f4c2..d35bc0e 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1650,9 +1650,8 @@ _mm256_mul_epi32(__m256i __a, __m256i __b) {
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
}
@@ -1670,8 +1669,7 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the products.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mulhi_epu16(__m256i __a, __m256i __b)
-{
+_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 23b2d29..ac75b6c 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1003,23 +1003,20 @@ _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_mulhrs_epi16(__A, __B),
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_mulhrs_epi16(__A, __B),
(__v32hi)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 639fb60..0fcfe37 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1510,28 +1510,28 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
__builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_mulhrs_epi16(__X, __Y),
(__v8hi)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_mulhrs_epi16(__X, __Y),
(__v8hi)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_mulhrs_epi16(__X, __Y),
(__v16hi)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_mulhrs_epi16(__X, __Y),
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index ee96caa..5d0f20f 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -544,8 +544,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) {
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
/// products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mulhrs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
@@ -563,11 +563,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a,
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
/// products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mulhrs_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
- (__v8hi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mulhrs_pi16(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__zext128(__a),
+ (__v8hi)__zext128(__b)));
}
/// Copies the 8-bit integers from a 128-bit integer vector to the
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 652527a..ef1be23 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -12309,13 +12309,20 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
SourceLocation CC) {
assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
Source != Target);
+
+ // Lone surrogates have a distinct representation in UTF-32.
+ // Converting lone surrogates between UTF-16 and UTF-32 code units is
+ // common in practice, so don't warn on such conversions.
+ if (Source->isChar16Type() && Target->isChar32Type())
+ return;
+
Expr::EvalResult Result;
if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
S.isConstantEvaluatedContext())) {
llvm::APSInt Value(32);
Value = Result.Val.getInt();
bool IsASCII = Value <= 0x7F;
- bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF);
+ bool IsBMP = Value <= 0xDFFF || (Value >= 0xE000 && Value <= 0xFFFF);
bool ConversionPreservesSemantics =
IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP);
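
Concretely, with the early return and the widened IsBMP range, surrogate code units no longer trigger the diagnostic in either direction, which the test updates below reflect:

    char32_t W = char16_t(0xD800); // no longer warns: UTF-16 -> UTF-32
    char16_t N = char32_t(0xD800); // no longer warns: 0xD800 now counts as BMP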
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index ca7e3b2..038f396 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2864,9 +2864,9 @@ TemplateInstantiator::TransformNestedRequirement(
TemplateArgs, Constraint->getSourceRange(), Satisfaction,
/*TopLevelConceptId=*/nullptr, &NewConstraint);
- assert(!Success || !Trap.hasErrorOccurred() &&
- "Substitution failures must be handled "
- "by CheckConstraintSatisfaction.");
+ assert((!Success || !Trap.hasErrorOccurred()) &&
+ "Substitution failures must be handled "
+ "by CheckConstraintSatisfaction.");
}
if (!Success || Satisfaction.HasSubstitutionFailure())
diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
index bf35bee..3ddd659 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
@@ -104,7 +104,7 @@ class RAIIMutexDescriptor {
// this function is called instead of early returning it. To avoid this, a
// bool variable (IdentifierInfoInitialized) is used and the function will
// be run only once.
- const auto &ASTCtx = Call.getState()->getStateManager().getContext();
+ const auto &ASTCtx = Call.getASTContext();
Guard = &ASTCtx.Idents.get(GuardName);
}
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
index 9d3aeff..2420848 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
@@ -929,7 +929,7 @@ ObjCDeallocChecker::getValueReleasedByNillingOut(const ObjCMethodCall &M,
SVal Arg = M.getArgSVal(0);
ProgramStateRef notNilState, nilState;
std::tie(notNilState, nilState) =
- M.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>());
+ C.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>());
if (!(nilState && !notNilState))
return nullptr;
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
index f984caf..227cbfa 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
@@ -34,7 +34,7 @@ class ObjCSuperDeallocChecker
this, "[super dealloc] should not be called more than once",
categories::CoreFoundationObjectiveC};
- void initIdentifierInfoAndSelectors(ASTContext &Ctx) const;
+ void initIdentifierInfoAndSelectors(const ASTContext &Ctx) const;
bool isSuperDeallocMessage(const ObjCMethodCall &M) const;
@@ -214,8 +214,8 @@ void ObjCSuperDeallocChecker::diagnoseCallArguments(const CallEvent &CE,
}
}
-void
-ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors(ASTContext &Ctx) const {
+void ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors(
+ const ASTContext &Ctx) const {
if (IIdealloc)
return;
@@ -230,7 +230,7 @@ ObjCSuperDeallocChecker::isSuperDeallocMessage(const ObjCMethodCall &M) const {
if (M.getOriginExpr()->getReceiverKind() != ObjCMessageExpr::SuperInstance)
return false;
- ASTContext &Ctx = M.getState()->getStateManager().getContext();
+ const ASTContext &Ctx = M.getASTContext();
initIdentifierInfoAndSelectors(Ctx);
return M.getSelector() == SELdealloc;
diff --git a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
index 4fc1c57..db8bbee 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
@@ -211,13 +211,13 @@ private:
if (!DefaultType)
return;
- ProgramStateRef State = ConstructorCall->getState();
+ ProgramStateRef State = C.getState();
State = State->set<VariantHeldTypeMap>(ThisMemRegion, *DefaultType);
C.addTransition(State);
}
bool handleStdGetCall(const CallEvent &Call, CheckerContext &C) const {
- ProgramStateRef State = Call.getState();
+ ProgramStateRef State = C.getState();
const auto &ArgType = Call.getArgSVal(0)
.getType(C.getASTContext())
diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
index dec4612..b8fb572 100644
--- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
+++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
@@ -52,7 +52,7 @@ removeInformationStoredForDeadInstances(const CallEvent &Call,
template <class TypeMap>
void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C,
SVal ThisSVal) {
- ProgramStateRef State = Call.getState();
+ ProgramStateRef State = C.getState();
if (!State)
return;
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index b798618..a505d70 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1038,6 +1038,7 @@ __m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
// CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_mulhrs_epi16(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_mulhrs_epi16((__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +18, -20, -21, -22, -22, +21, +20, -18, -16, +13, +9, -5));
__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_mullo_epi16
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index fddf17d..55bf482 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1596,18 +1596,24 @@ __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.pmul.hr.sw.512
return _mm512_mulhrs_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_mulhrs_epi16((__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, -83, +82, +81, -79, -77, +74, +70, -66, -61, +56, +49, -43, -35, +27, +19, -10));
+
__m512i test_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_mulhrs_epi16
// CHECK: @llvm.x86.avx512.pmul.hr.sw.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_mulhrs_epi16(__W,__U,__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_mulhrs_epi16(_mm512_set1_epi16(1), 0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1));
+
__m512i test_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_mulhrs_epi16
// CHECK: @llvm.x86.avx512.pmul.hr.sw.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mulhrs_epi16(__U,__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mulhrs_epi16(0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
+
__m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mulhi_epi16
// CHECK: @llvm.x86.avx512.pmulh.w.512
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index d569283..95e4d40 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -2061,6 +2061,7 @@ __m128i test_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_mulhrs_epi16(__W, __U, __X, __Y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_mask_mulhrs_epi16(_mm_set1_epi16(1), 0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +1, +1, +1, +1));
__m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
// CHECK-LABEL: test_mm_maskz_mulhrs_epi16
@@ -2068,6 +2069,7 @@ __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_mulhrs_epi16(__U, __X, __Y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_mulhrs_epi16(0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, 0, 0, 0, 0));
__m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_mask_mulhrs_epi16
@@ -2075,6 +2077,7 @@ __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_mulhrs_epi16(__W, __U, __X, __Y);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_mulhrs_epi16(_mm256_set1_epi16(1), 0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +1, +1, +1, +1, +1, +1, +1, +1, -16, +13, +9, -5));
__m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_maskz_mulhrs_epi16
@@ -2082,6 +2085,7 @@ __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_mulhrs_epi16(__U, __X, __Y);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_mulhrs_epi16(0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, 0, 0, 0, 0, 0, 0, 0, 0, -16, +13, +9, -5));
__m128i test_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_mulhi_epu16
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index d9041d4..c1ac57b 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -438,6 +438,7 @@ __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(
return _mm_mulhrs_pi16(a, b);
}
+TEST_CONSTEXPR(match_v4hi(_mm_mulhrs_pi16((__m64)(__v4hi){+100, +200, -300, -400}, (__m64)(__v4hi){+30000, -20000, +10000, -5000}), +92, -122, -92, +61));
__m64 test_mm_mullo_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mullo_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 32abd9d..f70afc0 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -110,6 +110,7 @@ __m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_mulhrs_epi16(a, b);
}
+TEST_CONSTEXPR(match_v8hi(_mm_mulhrs_epi16((__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +61, -55, -43, -24));
__m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_shuffle_epi8
diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
index fcff006..f17f20c 100644
--- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
+++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
@@ -14,7 +14,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}}
c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}}
- c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}}
+ c32(u16);
c32(u32);
@@ -30,7 +30,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c16(char32_t(0x7f));
c16(char32_t(0x80));
c16(char32_t(0xD7FF));
- c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}}
+ c16(char32_t(0xD800));
c16(char32_t(0xE000));
c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}}
@@ -44,8 +44,8 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c32(char16_t(0x80));
c32(char16_t(0xD7FF));
- c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}}
- c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}}
+ c32(char16_t(0xD800));
+ c32(char16_t(0xDFFF));
c32(char16_t(0xE000));
c32(char16_t(u'☕'));
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 52f02c3..6488e38 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -176,7 +176,6 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
CHECK_PARSE_BOOL(BreakBeforeTernaryOperators);
CHECK_PARSE_BOOL(BreakStringLiterals);
CHECK_PARSE_BOOL(CompactNamespaces);
- CHECK_PARSE_BOOL(Cpp11BracedListStyle);
CHECK_PARSE_BOOL(DerivePointerAlignment);
CHECK_PARSE_BOOL_FIELD(DerivePointerAlignment, "DerivePointerBinding");
CHECK_PARSE_BOOL(DisableFormat);
@@ -1139,6 +1138,18 @@ TEST(ConfigParseTest, ParsesConfiguration) {
FormatStyle::SDS_Leave);
CHECK_PARSE("SeparateDefinitionBlocks: Never", SeparateDefinitionBlocks,
FormatStyle::SDS_Never);
+
+ CHECK_PARSE("Cpp11BracedListStyle: Block", Cpp11BracedListStyle,
+ FormatStyle::BLS_Block);
+ CHECK_PARSE("Cpp11BracedListStyle: FunctionCall", Cpp11BracedListStyle,
+ FormatStyle::BLS_FunctionCall);
+ CHECK_PARSE("Cpp11BracedListStyle: AlignFirstComment", Cpp11BracedListStyle,
+ FormatStyle::BLS_AlignFirstComment);
+ // For backward compatibility:
+ CHECK_PARSE("Cpp11BracedListStyle: false", Cpp11BracedListStyle,
+ FormatStyle::BLS_Block);
+ CHECK_PARSE("Cpp11BracedListStyle: true", Cpp11BracedListStyle,
+ FormatStyle::BLS_AlignFirstComment);
}
TEST(ConfigParseTest, ParsesConfigurationWithLanguages) {
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index b9ad930..0fb8139 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -14363,7 +14363,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) {
BreakBeforeLambdaBody);
FormatStyle ExtraSpaces = getLLVMStyle();
- ExtraSpaces.Cpp11BracedListStyle = false;
+ ExtraSpaces.Cpp11BracedListStyle = FormatStyle::BLS_Block;
ExtraSpaces.ColumnLimit = 75;
verifyFormat("vector<int> x{ 1, 2, 3, 4 };", ExtraSpaces);
verifyFormat("vector<T> x{ {}, {}, {}, {} };", ExtraSpaces);
@@ -20346,7 +20346,7 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) {
" return 0;\n"
"}()};",
BracedAlign);
- BracedAlign.Cpp11BracedListStyle = false;
+ BracedAlign.Cpp11BracedListStyle = FormatStyle::BLS_Block;
verifyFormat("const auto result{ []() {\n"
" const auto something = 1;\n"
" return 2;\n"
@@ -21953,14 +21953,14 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresRightAlignment) {
"});",
Style);
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
verifyFormat("struct test demo[] = {\n"
" { 56, 23, \"hello\" },\n"
" { -1, 93463, \"world\" },\n"
" { 7, 5, \"!!\" }\n"
"};",
Style);
- Style.Cpp11BracedListStyle = true;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment;
Style.ColumnLimit = 0;
verifyFormat(
@@ -22220,14 +22220,14 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
" };",
Style);
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
verifyFormat("struct test demo[] = {\n"
" { 56, 23, \"hello\" },\n"
" { -1, 93463, \"world\" },\n"
" { 7, 5, \"!!\" }\n"
"};",
Style);
- Style.Cpp11BracedListStyle = true;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment;
Style.ColumnLimit = 0;
verifyFormat(
diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp
index ea85ed6..d7fb15d 100644
--- a/clang/unittests/Format/FormatTestCSharp.cpp
+++ b/clang/unittests/Format/FormatTestCSharp.cpp
@@ -1194,7 +1194,7 @@ TEST_F(FormatTestCSharp, CSharpSpaces) {
Style.SpaceBeforeSquareBrackets = false;
Style.SpacesInSquareBrackets = false;
Style.SpaceBeforeCpp11BracedList = true;
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
Style.SpacesInContainerLiterals = false;
Style.SpaceAfterCStyleCast = false;
diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp
index 69026bc..fc80bf4 100644
--- a/clang/unittests/Format/FormatTestComments.cpp
+++ b/clang/unittests/Format/FormatTestComments.cpp
@@ -4699,6 +4699,58 @@ TEST_F(FormatTestComments, SplitCommentIntroducers) {
getLLVMStyleWithColumns(10)));
}
+TEST_F(FormatTestComments, LineCommentsOnStartOfFunctionCall) {
+ auto Style = getLLVMStyle();
+
+ EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment);
+ verifyFormat("Type name{// Comment\n"
+ " value};",
+ Style);
+
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
+ verifyFormat("Type name{ // Comment\n"
+ " value\n"
+ "};",
+ Style);
+
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_FunctionCall;
+ verifyFormat("Type name{ // Comment\n"
+ " value};",
+ Style);
+
+ verifyFormat("T foo( // Comment\n"
+ " arg);",
+ Style);
+
+ verifyFormat("T bar{ // Comment\n"
+ " arg};",
+ Style);
+
+ verifyFormat("T baz({ // Comment\n"
+ " arg});",
+ Style);
+
+ verifyFormat("T baz{{ // Comment\n"
+ " arg}};",
+ Style);
+
+ verifyFormat("T b0z(f( // Comment\n"
+ " arg));",
+ Style);
+
+ verifyFormat("T b0z(F{ // Comment\n"
+ " arg});",
+ Style);
+
+ verifyFormat("func( // Comment\n"
+ " arg);",
+ Style);
+
+ verifyFormat("func({ // Comment\n"
+ " arg});",
+ Style);
+}
+
} // end namespace
} // namespace test
} // end namespace format
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 1275564..1416614b 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -236,7 +236,7 @@ TEST_F(FormatTestJava, ArrayInitializers) {
"};");
FormatStyle Style = getStyleWithColumns(65);
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
verifyFormat(
"expected = new int[] { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,\n"
" 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 };",
diff --git a/clang/unittests/Format/FormatTestTextProto.cpp b/clang/unittests/Format/FormatTestTextProto.cpp
index fd65c9a..6cddb838 100644
--- a/clang/unittests/Format/FormatTestTextProto.cpp
+++ b/clang/unittests/Format/FormatTestTextProto.cpp
@@ -514,7 +514,7 @@ TEST_F(FormatTestTextProto, FormatsRepeatedListInitializers) {
"key: value");
auto Style = getDefaultStyle();
- Style.Cpp11BracedListStyle = true;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment;
verifyFormat("keys: [1]", Style);
}
diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp
index 5c50ae6..63e2cadf 100644
--- a/clang/unittests/Format/FormatTestVerilog.cpp
+++ b/clang/unittests/Format/FormatTestVerilog.cpp
@@ -1287,7 +1287,7 @@ TEST_F(FormatTestVerilog, StringLiteral) {
getStyleWithColumns(getDefaultStyle(), 32));
// Space around braces should be correct.
auto Style = getStyleWithColumns(getDefaultStyle(), 24);
- Style.Cpp11BracedListStyle = false;
+ Style.Cpp11BracedListStyle = FormatStyle::BLS_Block;
verifyFormat(R"(x({ "xxxxxxxxxxxxxxxx ",
"xxxx" });)",
R"(x("xxxxxxxxxxxxxxxx xxxx");)", Style);
diff --git a/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp b/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp
index 6f02814..8d1b9ef1 100644
--- a/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp
@@ -7,13 +7,14 @@
// This code is based on the repro in https://github.com/google/sanitizers/issues/749
#include <cstdio>
#include <exception>
+#include <stdexcept>
-void throwInFunction() { throw std::exception("test2"); }
+void throwInFunction() { throw std::runtime_error("test2"); }
int main() {
// case 1: direct throw
try {
- throw std::exception("test1");
+ throw std::runtime_error("test1");
} catch (const std::exception &ex) {
puts(ex.what());
// CHECK: test1
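
Context for the substitution, as a sketch assuming only the standard library: `std::exception` has no `const char *` constructor in ISO C++ (that signature is an MSVC extension), so the portable spelling wraps the message in `std::runtime_error`, which still prints through the `std::exception` base:

#include <cstdio>
#include <stdexcept>

// Portable equivalent of the MSVC-only std::exception("msg") pattern the
// hunk removes: runtime_error carries the message, what() exposes it.
int demo() {
  try {
    throw std::runtime_error("test");
  } catch (const std::exception &Ex) {
    std::puts(Ex.what());
  }
  return 0;
}
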
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
index 2427da0..ed177ba 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -5,8 +5,10 @@ add_flang_library(FIROpenACCTransforms
FIROpenACCPassesIncGen
LINK_LIBS
+ FIRDialect
+
+ MLIR_LIBS
MLIRIR
MLIRPass
- FIRDialect
MLIROpenACCDialect
)
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 642bf7d..2b1aa28 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -53,8 +53,7 @@ Implemented Papers
- P2711R1: Making multi-param constructors of ``views`` ``explicit`` (`Github <https://github.com/llvm/llvm-project/issues/105252>`__)
- P2770R0: Stashing stashing ``iterators`` for proper flattening (`Github <https://github.com/llvm/llvm-project/issues/105250>`__)
- P2655R3: ``common_reference_t`` of ``reference_wrapper`` Should Be a Reference Type (`Github <https://github.com/llvm/llvm-project/issues/105260>`__)
-- P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://github.com/llvm/llvm-project/issues/105424>`__)
-- P3379R0: Constrain ``std::expected equality`` operators (`Github <https://github.com/llvm/llvm-project/issues/118135>`__)
+- P3379R0: Constrain ``std::expected`` equality operators (`Github <https://github.com/llvm/llvm-project/issues/118135>`__)
Improvements and New Features
-----------------------------
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index 0fd7e53..1a7a2c7 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -44,6 +44,7 @@ Implemented Papers
- P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__)
- P3060R3: Add ``std::views::indices(n)`` (`Github <https://llvm.org/PR148175>`__)
- P2835R7: Expose ``std::atomic_ref``'s object address (`Github <https://llvm.org/PR118377>`__)
+- P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://llvm.org/PR105424>`__)
- P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__)
Improvements and New Features
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 7bf7bc9..237217a 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -149,5 +149,5 @@
"`LWG3343 <https://wg21.link/LWG3343>`__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16","`#105356 <https://github.com/llvm/llvm-project/issues/105356>`__",""
"`LWG4139 <https://wg21.link/LWG4139>`__","§[time.zone.leap] recursive constraint in ``<=>``","Not Adopted Yet","|Complete|","20","`#118369 <https://github.com/llvm/llvm-project/issues/118369>`__",""
"`LWG3456 <https://wg21.link/LWG3456>`__","Pattern used by ``std::from_chars`` is underspecified (option B)","Not Adopted Yet","|Complete|","20","`#118370 <https://github.com/llvm/llvm-project/issues/118370>`__",""
-"`LWG3882 <https://wg21.link/LWG3882>`__","``tuple`` relational operators have confused friendships","Not Adopted Yet","|Complete|","21","The comparsion operators are constrained harder than the proposed resolution. libstdc++ and MSVC STL do the same.",""
+"`LWG3882 <https://wg21.link/LWG3882>`__","``tuple`` relational operators have confused friendships","Not Adopted Yet","|Complete|","22","The comparison operators are constrained more strictly than the proposed resolution. libstdc++ and MSVC STL do the same.",""
"","","","","",""
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 9b83047..0eedc82 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -59,7 +59,7 @@
"`P2248R8 <https://wg21.link/P2248R8>`__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","","`#105421 <https://github.com/llvm/llvm-project/issues/105421>`__",""
"`P2810R4 <https://wg21.link/P2810R4>`__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","","`#105422 <https://github.com/llvm/llvm-project/issues/105422>`__",""
"`P1068R11 <https://wg21.link/P1068R11>`__","Vector API for random number generation","2024-03 (Tokyo)","","","`#105423 <https://github.com/llvm/llvm-project/issues/105423>`__",""
-"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Complete|","21","`#105424 <https://github.com/llvm/llvm-project/issues/105424>`__","The changes to ``tuple``'s equality overload from P2165R4 are not yet implemented."
+"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Complete|","22","`#105424 <https://github.com/llvm/llvm-project/issues/105424>`__","The changes to ``tuple``'s equality overload from P2165R4 are not yet implemented."
"`P2642R6 <https://wg21.link/P2642R6>`__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","","`#105425 <https://github.com/llvm/llvm-project/issues/105425>`__",""
"`P3029R1 <https://wg21.link/P3029R1>`__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19","`#105426 <https://github.com/llvm/llvm-project/issues/105426>`__",""
"","","","","",""
diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp
index 00aa3cd..b5d8268 100644
--- a/llvm/benchmarks/SpecialCaseListBM.cpp
+++ b/llvm/benchmarks/SpecialCaseListBM.cpp
@@ -110,6 +110,26 @@ std::string genGlobAtBothSides(const std::vector<std::string> &Files) {
return S;
}
+std::string genGlobAtBothSidesAndMid(const std::vector<std::string> &Files) {
+ std::string S;
+ std::minstd_rand Rng(RNG_SEED);
+ for (std::string F : Files) {
+ std::uniform_int_distribution<> PosDistrib(0, F.size() - 1);
+ F[PosDistrib(Rng)] = '*';
+
+ std::uniform_int_distribution<> Ends(0, 1);
+ if (Ends(Rng)) {
+ F.back() = '*';
+ F.front() = '*';
+ }
+
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
void BM_Make_(
benchmark::State &state,
std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
@@ -171,6 +191,9 @@ BENCHMARK_CAPTURE(BM_Make_, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_Make_, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_CAPTURE(BM_True_, None_, genGlobNone)
->RangeMultiplier(MAX_LIST_MUL)
@@ -187,6 +210,9 @@ BENCHMARK_CAPTURE(BM_True_, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_True_, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_CAPTURE(BM_False, None_, genGlobNone)
->RangeMultiplier(MAX_LIST_MUL)
@@ -203,5 +229,8 @@ BENCHMARK_CAPTURE(BM_False, Mid__, genGlobInMid)
BENCHMARK_CAPTURE(BM_False, Both_, genGlobAtBothSides)
->RangeMultiplier(MAX_LIST_MUL)
->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Mix__, genGlobAtBothSidesAndMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
BENCHMARK_MAIN();
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 1af2761..1fbc41c 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -154,12 +154,9 @@ struct ResolveUnderlyingType {
using type = std::underlying_type_t<T>;
};
template <typename T> struct ResolveUnderlyingType<T, false> {
- using type = T;
-};
-template <> struct ResolveUnderlyingType<bool, false> {
- /// In case sizeof(bool) != 1, replace `void` by an additionnal
- /// std::conditional.
- using type = std::conditional_t<sizeof(bool) == 1, uint8_t, void>;
+ static_assert(!std::is_same_v<T, bool> || sizeof(bool) == 1,
+ "T being bool requires sizeof(bool) == 1.");
+ using type = std::conditional_t<std::is_same_v<T, bool>, uint8_t, T>;
};
} // namespace bitfields_details
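
A standalone restatement of the type-mapping idiom the rewritten specialization uses, for readers skimming the hunk: enums resolve to their underlying type, `bool` to `uint8_t` (guarded by the `static_assert`), everything else to itself. The `Underlying` name is illustrative, not the header's:

#include <cstdint>
#include <type_traits>

// Illustrative copy of the collapsed specialization: one std::conditional_t
// now covers the case the dedicated bool specialization used to handle.
template <typename T, bool IsEnum = std::is_enum_v<T>> struct Underlying {
  using type = std::underlying_type_t<T>; // enums: underlying integer type
};
template <typename T> struct Underlying<T, false> {
  static_assert(!std::is_same_v<T, bool> || sizeof(bool) == 1,
                "T being bool requires sizeof(bool) == 1.");
  using type = std::conditional_t<std::is_same_v<T, bool>, uint8_t, T>;
};

static_assert(std::is_same_v<Underlying<bool>::type, uint8_t>);
static_assert(std::is_same_v<Underlying<unsigned>::type, unsigned>);
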
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 4bda50f..25b5262 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -42,7 +42,7 @@ namespace detail {
// We extend a pair to allow users to override the bucket type with their own
// implementation without requiring two members.
template <typename KeyT, typename ValueT>
-struct DenseMapPair : public std::pair<KeyT, ValueT> {
+struct DenseMapPair : std::pair<KeyT, ValueT> {
using std::pair<KeyT, ValueT>::pair;
KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
diff --git a/llvm/include/llvm/ADT/DepthFirstIterator.h b/llvm/include/llvm/ADT/DepthFirstIterator.h
index 4ced758..3c54f32 100644
--- a/llvm/include/llvm/ADT/DepthFirstIterator.h
+++ b/llvm/include/llvm/ADT/DepthFirstIterator.h
@@ -66,8 +66,8 @@ public:
// one more method, completed, which is invoked when all children of a
// node have been processed. It is intended to distinguish back and
// cross edges in the spanning tree but is not used in the common case.
-template <typename NodeRef, unsigned SmallSize=8>
-struct df_iterator_default_set : public SmallPtrSet<NodeRef, SmallSize> {
+template <typename NodeRef, unsigned SmallSize = 8>
+struct df_iterator_default_set : SmallPtrSet<NodeRef, SmallSize> {
using BaseSet = SmallPtrSet<NodeRef, SmallSize>;
using iterator = typename BaseSet::iterator;
@@ -235,8 +235,10 @@ iterator_range<df_iterator<T>> depth_first(const T& G) {
}
// Provide global definitions of external depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct df_ext_iterator : public df_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct df_ext_iterator : df_iterator<T, SetTy, true> {
df_ext_iterator(const df_iterator<T, SetTy, true> &V)
: df_iterator<T, SetTy, true>(V) {}
};
@@ -262,7 +264,7 @@ template <class T,
class SetTy =
df_iterator_default_set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct idf_iterator : public df_iterator<Inverse<T>, SetTy, External> {
+struct idf_iterator : df_iterator<Inverse<T>, SetTy, External> {
idf_iterator(const df_iterator<Inverse<T>, SetTy, External> &V)
: df_iterator<Inverse<T>, SetTy, External>(V) {}
};
@@ -284,8 +286,10 @@ iterator_range<idf_iterator<T>> inverse_depth_first(const T& G) {
}
// Provide global definitions of external inverse depth first iterators...
-template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
-struct idf_ext_iterator : public idf_iterator<T, SetTy, true> {
+template <class T,
+ class SetTy =
+ df_iterator_default_set<typename GraphTraits<T>::NodeRef>>
+struct idf_ext_iterator : idf_iterator<T, SetTy, true> {
idf_ext_iterator(const idf_iterator<T, SetTy, true> &V)
: idf_iterator<T, SetTy, true>(V) {}
idf_ext_iterator(const df_iterator<Inverse<T>, SetTy, true> &V)
diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h
index 310539f..8b2425e 100644
--- a/llvm/include/llvm/ADT/ImmutableSet.h
+++ b/llvm/include/llvm/ADT/ImmutableSet.h
@@ -931,8 +931,7 @@ struct ImutProfileInfo<T*> {
/// ImutContainerInfo - Generic definition of comparison operations for
/// elements of immutable containers that defaults to using
/// std::equal_to<> and std::less<> to perform comparison of elements.
-template <typename T>
-struct ImutContainerInfo : public ImutProfileInfo<T> {
+template <typename T> struct ImutContainerInfo : ImutProfileInfo<T> {
using value_type = typename ImutProfileInfo<T>::value_type;
using value_type_ref = typename ImutProfileInfo<T>::value_type_ref;
using key_type = value_type;
@@ -957,8 +956,7 @@ struct ImutContainerInfo : public ImutProfileInfo<T> {
/// ImutContainerInfo - Specialization for pointer values to treat pointers
/// as references to unique objects. Pointers are thus compared by
/// their addresses.
-template <typename T>
-struct ImutContainerInfo<T*> : public ImutProfileInfo<T*> {
+template <typename T> struct ImutContainerInfo<T *> : ImutProfileInfo<T *> {
using value_type = typename ImutProfileInfo<T*>::value_type;
using value_type_ref = typename ImutProfileInfo<T*>::value_type_ref;
using key_type = value_type;
diff --git a/llvm/include/llvm/ADT/PostOrderIterator.h b/llvm/include/llvm/ADT/PostOrderIterator.h
index 1cbd3c1..d9aa452 100644
--- a/llvm/include/llvm/ADT/PostOrderIterator.h
+++ b/llvm/include/llvm/ADT/PostOrderIterator.h
@@ -200,7 +200,7 @@ template <class T> iterator_range<po_iterator<T>> post_order(const T &G) {
// Provide global definitions of external postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct po_ext_iterator : public po_iterator<T, SetType, true> {
+struct po_ext_iterator : po_iterator<T, SetType, true> {
po_ext_iterator(const po_iterator<T, SetType, true> &V) :
po_iterator<T, SetType, true>(V) {}
};
@@ -223,7 +223,7 @@ iterator_range<po_ext_iterator<T, SetType>> post_order_ext(const T &G, SetType &
// Provide global definitions of inverse post order iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>,
bool External = false>
-struct ipo_iterator : public po_iterator<Inverse<T>, SetType, External> {
+struct ipo_iterator : po_iterator<Inverse<T>, SetType, External> {
ipo_iterator(const po_iterator<Inverse<T>, SetType, External> &V) :
po_iterator<Inverse<T>, SetType, External> (V) {}
};
@@ -245,7 +245,7 @@ iterator_range<ipo_iterator<T>> inverse_post_order(const T &G) {
// Provide global definitions of external inverse postorder iterators...
template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>>
-struct ipo_ext_iterator : public ipo_iterator<T, SetType, true> {
+struct ipo_ext_iterator : ipo_iterator<T, SetType, true> {
ipo_ext_iterator(const ipo_iterator<T, SetType, true> &V) :
ipo_iterator<T, SetType, true>(V) {}
ipo_ext_iterator(const po_iterator<Inverse<T>, SetType, true> &V) :
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 658f262..a9841c6 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -674,7 +674,7 @@ using zip_traits = iterator_facade_base<
ReferenceTupleType *, ReferenceTupleType>;
template <typename ZipType, typename ReferenceTupleType, typename... Iters>
-struct zip_common : public zip_traits<ZipType, ReferenceTupleType, Iters...> {
+struct zip_common : zip_traits<ZipType, ReferenceTupleType, Iters...> {
using Base = zip_traits<ZipType, ReferenceTupleType, Iters...>;
using IndexSequence = std::index_sequence_for<Iters...>;
using value_type = typename Base::value_type;
diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h
index f588a77..8e7c8b3 100644
--- a/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -532,18 +532,8 @@ class SmallPtrSet : public SmallPtrSetImpl<PtrType> {
using BaseT = SmallPtrSetImpl<PtrType>;
- // A constexpr version of llvm::bit_ceil.
- // TODO: Replace this with std::bit_ceil once C++20 is available.
- static constexpr size_t RoundUpToPowerOfTwo(size_t X) {
- size_t C = 1;
- size_t CMax = C << (std::numeric_limits<size_t>::digits - 1);
- while (C < X && C < CMax)
- C <<= 1;
- return C;
- }
-
// Make sure that SmallSize is a power of two, round up if not.
- static constexpr size_t SmallSizePowTwo = RoundUpToPowerOfTwo(SmallSize);
+ static constexpr size_t SmallSizePowTwo = llvm::bit_ceil_constexpr(SmallSize);
/// SmallStorage - Fixed size storage used in 'small mode'.
const void *SmallStorage[SmallSizePowTwo];
diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index 8c68d0a..8b60b69 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -336,6 +336,21 @@ template <typename T> [[nodiscard]] T bit_ceil(T Value) {
return T(1) << llvm::bit_width<T>(Value - 1u);
}
+/// Returns the smallest integral power of two no smaller than Value if Value is
+/// nonzero. Returns 1 otherwise.
+///
+/// Ex. bit_ceil_constexpr(5) == 8.
+///
+/// The return value is undefined if the input is larger than the largest power
+/// of two representable in T.
+template <typename T> [[nodiscard]] constexpr T bit_ceil_constexpr(T Value) {
+ static_assert(std::is_unsigned_v<T>,
+ "Only unsigned integral types are allowed.");
+ if (Value < 2)
+ return 1;
+ return T(1) << llvm::bit_width_constexpr<T>(Value - 1u);
+}
+
template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>>
[[nodiscard]] constexpr T rotl(T V, int R) {
constexpr unsigned N = std::numeric_limits<T>::digits;
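
A short usage sketch of the new helper; the values follow directly from the definition above, and the plain `bit_ceil` earlier in the header is not `constexpr`, which is the gap this addition fills for the SmallPtrSet change:

#include "llvm/ADT/bit.h"

// bit_ceil_constexpr rounds up to the next power of two at compile time.
static_assert(llvm::bit_ceil_constexpr(0u) == 1u);
static_assert(llvm::bit_ceil_constexpr(1u) == 1u);
static_assert(llvm::bit_ceil_constexpr(5u) == 8u);
static_assert(llvm::bit_ceil_constexpr(8u) == 8u);
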
diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
index a4ca54e..f9d7c76 100644
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -103,7 +103,7 @@ inline Align assumeAligned(uint64_t Value) {
/// This struct is a compact representation of a valid (power of two) or
/// undefined (0) alignment.
-struct MaybeAlign : public std::optional<Align> {
+struct MaybeAlign : std::optional<Align> {
private:
using UP = std::optional<Align>;
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 2a9a149..6f6df2e 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -340,7 +340,7 @@ struct ValueFromPointerCast
/// during the cast. It's also a good example of how to implement a move-only
/// cast.
template <typename To, typename From, typename Derived = void>
-struct UniquePtrCast : public CastIsPossible<To, From *> {
+struct UniquePtrCast : CastIsPossible<To, From *> {
using Self = detail::SelfType<Derived, UniquePtrCast<To, From>>;
using CastResultType = std::unique_ptr<
std::remove_reference_t<typename cast_retty<To, From>::ret_type>>;
@@ -473,7 +473,7 @@ struct ForwardToPointerCast {
// take advantage of the cast traits whenever possible!
template <typename To, typename From, typename Enable = void>
-struct CastInfo : public CastIsPossible<To, From> {
+struct CastInfo : CastIsPossible<To, From> {
using Self = CastInfo<To, From, Enable>;
using CastReturnType = typename cast_retty<To, From>::ret_type;
@@ -536,8 +536,7 @@ struct CastInfo<To, std::unique_ptr<From>> : public UniquePtrCast<To, From> {};
/// the input is std::optional<From> that the output can be std::optional<To>.
/// If that's not the case, specialize CastInfo for your use case.
template <typename To, typename From>
-struct CastInfo<To, std::optional<From>> : public OptionalValueCast<To, From> {
-};
+struct CastInfo<To, std::optional<From>> : OptionalValueCast<To, From> {};
/// isa<X> - Return true if the parameter to the template is an instance of one
/// of the template type arguments. Used like this:
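
The example the truncated comment ("Used like this:") points at is the familiar one; a small sketch with an invented helper name:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Hypothetical helper showing the isa<>/cast<> idiom the comment introduces.
bool isIntegerLoad(const llvm::Value *V) {
  if (!llvm::isa<llvm::LoadInst>(V))
    return false;
  return llvm::cast<llvm::LoadInst>(V)->getType()->isIntegerTy();
}
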
diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index dd05c53..5a5f00e 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -549,7 +549,7 @@ template <class DataType> struct OptionValue;
// The default value safely does nothing. Option value printing is only
// best-effort.
template <class DataType, bool isClass>
-struct OptionValueBase : public GenericOptionValue {
+struct OptionValueBase : GenericOptionValue {
// Temporary storage for argument passing.
using WrapperType = OptionValue<DataType>;
diff --git a/llvm/include/llvm/Support/DOTGraphTraits.h b/llvm/include/llvm/Support/DOTGraphTraits.h
index bf30aa4..3b9fe00 100644
--- a/llvm/include/llvm/Support/DOTGraphTraits.h
+++ b/llvm/include/llvm/Support/DOTGraphTraits.h
@@ -162,8 +162,7 @@ public:
/// graphs are converted to 'dot' graphs. When specializing, you may inherit
/// from DefaultDOTGraphTraits if you don't need to override everything.
///
-template <typename Ty>
-struct DOTGraphTraits : public DefaultDOTGraphTraits {
+template <typename Ty> struct DOTGraphTraits : DefaultDOTGraphTraits {
using DefaultDOTGraphTraits::DefaultDOTGraphTraits;
};
diff --git a/llvm/include/llvm/Support/ELFAttributes.h b/llvm/include/llvm/Support/ELFAttributes.h
index 270246f..5771a84 100644
--- a/llvm/include/llvm/Support/ELFAttributes.h
+++ b/llvm/include/llvm/Support/ELFAttributes.h
@@ -48,8 +48,6 @@ struct SubsectionAndTagToTagName {
StringRef SubsectionName;
unsigned Tag;
StringRef TagName;
- SubsectionAndTagToTagName(StringRef SN, unsigned Tg, StringRef TN)
- : SubsectionName(SN), Tag(Tg), TagName(TN) {}
};
namespace ELFAttrs {
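
Why the constructor can go: with no user-declared constructor left, `SubsectionAndTagToTagName` becomes an aggregate, so call sites can brace-initialize it. A sketch assuming the struct sits in namespace `llvm` as the header layout suggests; the field values are placeholders, not taken from the patch:

#include "llvm/Support/ELFAttributes.h"

llvm::SubsectionAndTagToTagName makeEntry() {
  // Aggregate initialization replaces the deleted three-argument
  // constructor; the values below are illustrative only.
  return {/*SubsectionName=*/"aeabi", /*Tag=*/6, /*TagName=*/"Tag_CPU_arch"};
}
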
diff --git a/llvm/include/llvm/Support/LSP/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h
index 93b82f1..e38203a 100644
--- a/llvm/include/llvm/Support/LSP/Protocol.h
+++ b/llvm/include/llvm/Support/LSP/Protocol.h
@@ -449,7 +449,7 @@ struct ReferenceContext {
bool fromJSON(const llvm::json::Value &value, ReferenceContext &result,
llvm::json::Path path);
-struct ReferenceParams : public TextDocumentPositionParams {
+struct ReferenceParams : TextDocumentPositionParams {
ReferenceContext context;
};
diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h
index ed29826..4ba3867 100644
--- a/llvm/include/llvm/Support/MD5.h
+++ b/llvm/include/llvm/Support/MD5.h
@@ -41,7 +41,7 @@ template <typename T> class ArrayRef;
class MD5 {
public:
- struct MD5Result : public std::array<uint8_t, 16> {
+ struct MD5Result : std::array<uint8_t, 16> {
LLVM_ABI SmallString<32> digest() const;
uint64_t low() const {
diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h
index 40709d4..a4ed712 100644
--- a/llvm/include/llvm/Support/Timer.h
+++ b/llvm/include/llvm/Support/Timer.h
@@ -167,7 +167,7 @@ public:
/// you to declare a new timer, AND specify the region to time, all in one
/// statement. All timers with the same name are merged. This is primarily
/// used for debugging and for hunting performance problems.
-struct NamedRegionTimer : public TimeRegion {
+struct NamedRegionTimer : TimeRegion {
LLVM_ABI explicit NamedRegionTimer(StringRef Name, StringRef Description,
StringRef GroupName,
StringRef GroupDescription,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 979a8b0..4b22c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
+#include <array>
namespace llvm {
@@ -45,7 +46,7 @@ struct GCNRegPressure {
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
}
- void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ void clear() { Value.fill(0); }
unsigned getNumRegs(RegKind Kind) const {
assert(Kind < TOTAL_KINDS);
@@ -127,9 +128,7 @@ struct GCNRegPressure {
bool less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
- bool operator==(const GCNRegPressure &O) const {
- return std::equal(&Value[0], &Value[ValueArraySize], O.Value);
- }
+ bool operator==(const GCNRegPressure &O) const { return Value == O.Value; }
bool operator!=(const GCNRegPressure &O) const {
return !(*this == O);
@@ -160,7 +159,7 @@ private:
/// Pressure for all register kinds (first all regular register kinds, then
/// all tuple register kinds).
- unsigned Value[ValueArraySize];
+ std::array<unsigned, ValueArraySize> Value;
static unsigned getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI);
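
The std::array switch in miniature, as a standalone sketch with an invented element count: `fill()` and member-wise `operator==` replace the raw-pointer `std::fill`/`std::equal` calls removed above.

#include <array>

// std::array supplies fill() and element-wise operator== for free, so no
// pointer-range algorithms are needed.
struct Pressure {
  std::array<unsigned, 4> Value{}; // element count is illustrative
  void clear() { Value.fill(0); }
  bool operator==(const Pressure &O) const { return Value == O.Value; }
};
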
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c9..09ef6ac 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
+  // 32-bit ABS is legal for all AMDGPU targets except R600
+ setOperationAction(ISD::ABS, MVT::i32, Expand);
+
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
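
What `Expand` lowers this to, restated as scalar C++: the `SUB_INT`/`MAX_INT` pairs in the R600 CHECK lines of the new `abs_i32.ll` test below are the machine-level form of the same identity. A purely illustrative sketch:

#include <cstdint>

// abs(x) == max(x, 0 - x); negation goes through unsigned arithmetic so the
// INT32_MIN case wraps in two's complement, like the hardware sequence.
int32_t expandAbs(int32_t X) {
  int32_t Neg = static_cast<int32_t>(0u - static_cast<uint32_t>(X));
  return X > Neg ? X : Neg;
}
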
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 942e784..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
- const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
- this]() -> bool {
- if (CmpValue != 0)
- return false;
-
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
- return false;
-
- bool CanOptimize = false;
-
- // For S_OP that set SCC = DST!=0, do the transformation
- //
- // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
- if (setsSCCifResultIsNonZero(*Def))
- CanOptimize = true;
-
- // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
- // the same value that will be calculated by s_cmp_lg_*
- //
- // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
- // imm), 0)
- if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
- bool Op1IsNonZeroImm =
- Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
- bool Op2IsZeroImm =
- Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
- if (Op1IsNonZeroImm && Op2IsZeroImm)
- CanOptimize = true;
- }
-
- if (!CanOptimize)
- return false;
-
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
- return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
- }
-
- if (MachineOperand *SccDef =
- Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
- CmpInstr.eraseFromParent();
- return true;
- };
-
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- MachineInstr *KillsSCC = nullptr;
- for (MachineInstr &MI :
- make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
- if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
+ I->killsRegister(AMDGPU::SCC, &RI))
return false;
- if (MI.killsRegister(AMDGPU::SCC, &RI))
- KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
- if (KillsSCC)
- KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10810,7 +10753,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 32, true, false);
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10818,7 +10761,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 64, true, false);
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ee99a74..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -709,30 +709,6 @@ public:
}
}
- static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
- if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
- return false;
- // Compares have no result
- if (MI.isCompare())
- return false;
- switch (MI.getOpcode()) {
- default:
- return true;
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_U32:
- case AMDGPU::S_ADDC_U32:
- case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUBB_U32:
- case AMDGPU::S_MIN_I32:
- case AMDGPU::S_MIN_U32:
- case AMDGPU::S_MAX_I32:
- case AMDGPU::S_MAX_U32:
- case AMDGPU::S_ADDK_I32:
- return false;
- }
- }
-
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index b80c3c9..4947d03 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -760,6 +761,7 @@ private:
void handleCallArguments(CallBase &CB);
void handleExtractOfWithOverflow(ExtractValueInst &EVI,
const WithOverflowInst *WO, unsigned Idx);
+ bool isInstFullyOverDefined(Instruction &Inst);
private:
friend class InstVisitor<SCCPInstVisitor>;
@@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
// 7. If a conditional branch has a value that is overdefined, make all
// successors executable.
void SCCPInstVisitor::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
- unsigned NumActiveIncoming = 0;
+ if (isInstFullyOverDefined(PN))
+ return;
+ SmallVector<unsigned> FeasibleIncomingIndices;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+ FeasibleIncomingIndices.push_back(i);
+ }
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
// constant. If they are constant and don't agree, the PHI is a constant
// range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
+ if (StructType *STy = dyn_cast<StructType>(PN.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement PhiState = getStructValueState(&PN, i);
+ if (PhiState.isOverdefined())
+ continue;
+ for (unsigned j : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV =
+ getStructValueState(PN.getIncomingValue(j), i);
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i);
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
+ }
+ } else {
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i : FeasibleIncomingIndices) {
+ const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ if (PhiState.isOverdefined())
+ break;
+ }
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ ValueLatticeElement &PhiStateRef = ValueState[&PN];
+ mergeInValue(PhiStateRef, &PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ FeasibleIncomingIndices.size() + 1));
+ PhiStateRef.setNumRangeExtensions(
+ std::max((unsigned)FeasibleIncomingIndices.size(),
+ PhiStateRef.getNumRangeExtensions()));
}
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- ValueLatticeElement &PhiStateRef = ValueState[&PN];
- mergeInValue(PhiStateRef, &PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
void SCCPInstVisitor::visitReturnInst(ReturnInst &I) {
@@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
}
}
+bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) {
+  // For struct types, we handle each member separately. A struct value is
+  // not considered overdefined as long as at least one member is not
+  // overdefined.
+ if (StructType *STy = dyn_cast<StructType>(Inst.getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) {
+ if (!getStructValueState(&Inst, i).isOverdefined())
+ return false;
+ }
+ return true;
+ }
+
+ return getValueState(&Inst).isOverdefined();
+}
+
void SCCPInstVisitor::solve() {
// Process the work lists until they are empty!
while (!BBWorkList.empty() || !InstWorkList.empty()) {
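
The helper's rule, restated standalone: a struct-typed value counts as fully overdefined only when every member does, while scalars reduce to their single state. The two-state lattice below is invented for the sketch; the real `ValueLatticeElement` carries much more:

#include <vector>

// Minimal stand-in lattice: only the overdefined bit matters here.
struct LatticeVal {
  bool Overdefined = false;
};

// Mirrors isInstFullyOverDefined: one non-overdefined member keeps the
// whole aggregate from being treated as overdefined.
bool isFullyOverDefined(const std::vector<LatticeVal> &MemberStates) {
  for (const LatticeVal &V : MemberStates)
    if (!V.Overdefined)
      return false;
  return true;
}
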
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b042..84d2ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -407,6 +407,10 @@ public:
VPBasicBlock *getParent() { return Parent; }
const VPBasicBlock *getParent() const { return Parent; }
+ /// \return the VPRegionBlock which the recipe belongs to.
+ VPRegionBlock *getRegion();
+ const VPRegionBlock *getRegion() const;
+
/// The method which generates the output IR instructions that correspond to
/// this VPRecipe, thereby "executing" the VPlan.
virtual void execute(VPTransformState &State) = 0;
@@ -4075,6 +4079,14 @@ public:
}
};
+inline VPRegionBlock *VPRecipeBase::getRegion() {
+ return getParent()->getParent();
+}
+
+inline const VPRegionBlock *VPRecipeBase::getRegion() const {
+ return getParent()->getParent();
+}
+
/// VPlan models a candidate for vectorization, encoding various decisions taken
/// to produce efficient output IR, including which branches, basic-blocks and
/// output IR instructions to generate, and their cost. VPlan holds a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63..7e074c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#ifndef NDEBUG
auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
+ VPRegionBlock *Region = R->getRegion();
if (Region && Region->isReplicator()) {
assert(Region->getNumSuccessors() == 1 &&
Region->getNumPredecessors() == 1 && "Expected SESE region!");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c75..d1e67e6b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = getParent()->getParent()->getCanonicalIV();
+ auto *CanIV = getRegion()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr,
State.AC->registerAssumption(II);
assert(
- (RepRecipe->getParent()->getParent() ||
+ (RepRecipe->getRegion() ||
!RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
all_of(RepRecipe->operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
@@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
to_vector(operands()), VF);
// If the recipe is not predicated (i.e. not in a replicate region), return
// the scalar cost. Otherwise handle predicated cost.
- if (!getParent()->getParent()->isReplicator())
+ if (!getRegion()->isReplicator())
return ScalarCost;
// Account for the phi nodes that we will create.
@@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::Store: {
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- const VPRegionBlock *ParentRegion = getParent()->getParent();
+ const VPRegionBlock *ParentRegion = getRegion();
if (ParentRegion && ParentRegion->isReplicator())
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee8..f5f616f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
return nullptr;
VPRegionBlock *EnclosingLoopRegion =
HoistCandidate->getParent()->getEnclosingLoopRegion();
- assert((!HoistCandidate->getParent()->getParent() ||
- HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
+ assert((!HoistCandidate->getRegion() ||
+ HoistCandidate->getRegion() == EnclosingLoopRegion) &&
"CFG in VPlan should still be flat, without replicate regions");
// Hoist candidate was already visited, no need to hoist.
if (!Visited.insert(HoistCandidate).second)
@@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = cast<VPRecipeBase>(&U);
- return R->getParent()->getParent() ||
+ return R->getRegion() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
ValueToSCEVMapTy RewriteMap;
@@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
- VPRegionBlock *ParentRegion =
- cast<VPRecipeBase>(U)->getParent()->getParent();
+ VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
};
if ((isa<VPReplicateRecipe>(DefR) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index cf95ac0..9a2497e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
return true;
if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
- const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+ const VPRegionBlock *RegionOfR = Rep->getRegion();
// Don't consider recipes in replicate regions as uniform yet; their first
// lane cannot be accessed when executing the replicate region for other
// lanes.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 7714c03..5171403 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,6 +140,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -344,6 +345,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b81669..7b01f13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,6 +143,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -347,6 +348,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i32.ll b/llvm/test/CodeGen/AMDGPU/abs_i32.ll
new file mode 100644
index 0000000..b53047f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/abs_i32.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600 %s
+
+define amdgpu_kernel void @abs_v1(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v1:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: SUB_INT * T1.W, 0.0, PV.W,
+; R600-NEXT: MAX_INT T0.X, T0.W, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @abs_v2(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v2:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z,
+; R600-NEXT: MAX_INT T0.X, KC0[2].Z, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %neg = sub i32 0, %arg
+ %cond = icmp sgt i32 %arg, %neg
+ %res = select i1 %cond, i32 %arg, i32 %neg
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @abs_v3(ptr addrspace(1) %out, i32 %arg) {
+; GFX9-LABEL: abs_v3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; R600-LABEL: abs_v3:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z,
+; R600-NEXT: MAX_INT T0.X, PV.W, KC0[2].Z,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %neg = sub i32 0, %arg
+ %cond = icmp sgt i32 %neg, %arg
+ %res = select i1 %cond, i32 %neg, i32 %arg
+ store i32 %res, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index 8088c1b..b72eba8 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,7 +180,11 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -195,10 +199,14 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s6, s2, s6
-; CHECK-NEXT: s_addc_u32 s7, s3, s7
+; CHECK-NEXT: s_add_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_addc_u32 s8, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -207,8 +215,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -225,10 +233,14 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s6, s2, s6
-; CHECK-NEXT: s_subb_u32 s7, s3, s7
+; CHECK-NEXT: s_sub_u32 s10, s2, s6
+; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
+; CHECK-NEXT: s_subb_u32 s8, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
+; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -237,8 +249,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -256,6 +268,8 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -278,6 +292,8 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -323,6 +339,8 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -345,6 +363,8 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c3..948811e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,9 +7821,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7854,6 +7855,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7879,50 +7881,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s16, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s4
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s17
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s16
-; GFX6-NEXT: s_add_i32 s18, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s18
-; GFX6-NEXT: s_mul_i32 s4, s10, s16
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s4, s5
-; GFX6-NEXT: s_subb_u32 s19, s14, s11
-; GFX6-NEXT: s_sub_u32 s20, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s20, s10
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s20, s16, 2
-; GFX6-NEXT: s_addc_u32 s21, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s20, s15
-; GFX6-NEXT: s_cselect_b32 s15, s21, s19
+; GFX6-NEXT: s_or_b32 s18, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_u32 s19, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s18
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s6, s10
-; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s10
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cselect_b32 s4, s17, s5
+; GFX6-NEXT: s_add_u32 s5, s14, 1
+; GFX6-NEXT: s_addc_u32 s17, s15, 0
+; GFX6-NEXT: s_add_u32 s19, s14, 2
+; GFX6-NEXT: s_addc_u32 s20, s15, 0
; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s5, s15, s17
-; GFX6-NEXT: s_cselect_b32 s4, s14, s16
+; GFX6-NEXT: s_cselect_b32 s4, s19, s5
+; GFX6-NEXT: s_cselect_b32 s5, s20, s17
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_subb_u32 s7, s7, s16
+; GFX6-NEXT: s_cmp_ge_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s16, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s7, s11
+; GFX6-NEXT: s_cselect_b32 s6, s6, s16
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b32 s5, s5, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s14
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7945,8 +7949,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s8
-; GFX9-NEXT: s_subb_u32 s5, 0, s9
+; GFX9-NEXT: s_sub_u32 s10, 0, s8
+; GFX9-NEXT: s_subb_u32 s11, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7956,52 +7960,56 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s11, v1
-; GFX9-NEXT: s_mul_i32 s12, s4, s10
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
-; GFX9-NEXT: s_mul_i32 s13, s5, s11
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s4, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
+; GFX9-NEXT: v_readfirstlane_b32 s12, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s13, s11, s4
+; GFX9-NEXT: s_add_i32 s5, s14, s5
+; GFX9-NEXT: s_mul_i32 s15, s10, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
+; GFX9-NEXT: s_mul_i32 s16, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
-; GFX9-NEXT: s_mul_i32 s15, s10, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s12, s5
+; GFX9-NEXT: s_add_u32 s5, s13, s5
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s12
-; GFX9-NEXT: s_addc_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_i32 s12, s4, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s5, s5, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
-; GFX9-NEXT: s_mul_i32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
-; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
-; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_add_u32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s4, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s10, s10, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
-; GFX9-NEXT: s_addc_u32 s4, s15, s13
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s4, s4, s12
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s11, s4
-; GFX9-NEXT: s_addc_u32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s10, s4
+; GFX9-NEXT: s_addc_u32 s10, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s12, s10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8020,35 +8028,38 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s13, s11, s10
-; GFX9-NEXT: s_addc_u32 s12, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
+; GFX9-NEXT: s_add_u32 s14, s11, s10
+; GFX9-NEXT: s_addc_u32 s15, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s13
-; GFX9-NEXT: s_add_i32 s14, s10, s11
-; GFX9-NEXT: s_sub_i32 s15, s3, s14
-; GFX9-NEXT: s_mul_i32 s10, s8, s13
+; GFX9-NEXT: s_mul_i32 s11, s9, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s12, s3, s16
+; GFX9-NEXT: s_mul_i32 s10, s8, s14
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s9
-; GFX9-NEXT: s_sub_u32 s16, s2, s8
-; GFX9-NEXT: s_subb_u32 s15, s15, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s9
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s17, s12, s9
+; GFX9-NEXT: s_sub_u32 s18, s2, s8
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_subb_u32 s12, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s12, s9
+; GFX9-NEXT: s_cselect_b32 s13, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s18, s8
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s8
-; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s15, s9
-; GFX9-NEXT: s_cselect_b32 s15, s16, s17
-; GFX9-NEXT: s_add_u32 s16, s13, 1
-; GFX9-NEXT: s_addc_u32 s17, s12, 0
-; GFX9-NEXT: s_add_u32 s18, s13, 2
-; GFX9-NEXT: s_addc_u32 s19, s12, 0
-; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b32 s15, s18, s16
-; GFX9-NEXT: s_cselect_b32 s16, s19, s17
+; GFX9-NEXT: s_cmp_eq_u32 s12, s9
+; GFX9-NEXT: s_cselect_b32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s13, s14, 1
+; GFX9-NEXT: s_addc_u32 s17, s15, 0
+; GFX9-NEXT: s_add_u32 s18, s14, 2
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b32 s12, s18, s13
+; GFX9-NEXT: s_cselect_b32 s13, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s16
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8056,8 +8067,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s16, s12
-; GFX9-NEXT: s_cselect_b32 s2, s15, s13
+; GFX9-NEXT: s_cselect_b32 s3, s13, s15
+; GFX9-NEXT: s_cselect_b32 s2, s12, s14
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8317,9 +8328,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8350,6 +8362,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8374,53 +8387,55 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s18, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s17, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s19
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s18
-; GFX6-NEXT: s_add_i32 s20, s14, s15
-; GFX6-NEXT: s_sub_i32 s16, s9, s20
-; GFX6-NEXT: s_mul_i32 s14, s6, s18
+; GFX6-NEXT: s_mul_i32 s15, s7, s17
+; GFX6-NEXT: s_add_i32 s18, s14, s15
+; GFX6-NEXT: s_sub_i32 s19, s9, s18
+; GFX6-NEXT: s_mul_i32 s14, s6, s17
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s14, s15
-; GFX6-NEXT: s_subb_u32 s21, s16, s7
-; GFX6-NEXT: s_sub_u32 s22, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s16, s17
-; GFX6-NEXT: s_subb_u32 s16, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s6
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s16, s21, s17
-; GFX6-NEXT: s_add_u32 s17, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b32 s16, s22, s17
-; GFX6-NEXT: s_cselect_b32 s17, s23, s21
+; GFX6-NEXT: s_or_b32 s20, s14, s15
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s19, s19, s7
+; GFX6-NEXT: s_sub_u32 s21, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s9, s9, s20
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s7
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s21, s6
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s7
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s17, 1
+; GFX6-NEXT: s_addc_u32 s19, s16, 0
+; GFX6-NEXT: s_add_u32 s21, s17, 2
+; GFX6-NEXT: s_addc_u32 s22, s16, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_cselect_b32 s15, s22, s19
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s18
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s18
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s17, s19
-; GFX6-NEXT: s_cselect_b32 s6, s16, s18
+; GFX6-NEXT: s_cselect_b32 s7, s15, s16
+; GFX6-NEXT: s_cselect_b32 s6, s14, s17
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s16, s6, s2
-; GFX6-NEXT: s_subb_u32 s17, s7, s3
+; GFX6-NEXT: s_sub_u32 s14, s6, s2
+; GFX6-NEXT: s_subb_u32 s15, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8439,39 +8454,40 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
+; GFX6-NEXT: s_mul_i32 s1, s12, s16
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s12, s2
+; GFX6-NEXT: s_mul_i32 s17, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s17, s16, s17
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s17
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s16, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s4, s16, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8485,14 +8501,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s12
+; GFX6-NEXT: s_addc_u32 s3, s16, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8501,6 +8517,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8512,70 +8529,72 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s15, s2
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_add_u32 s2, s17, s2
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v1
+; GFX6-NEXT: v_readfirstlane_b32 s17, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s14, s15
+; GFX6-NEXT: s_addc_u32 s2, s16, s17
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s18, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s16, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s19
+; GFX6-NEXT: s_addc_u32 s17, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s17
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s18
-; GFX6-NEXT: s_add_i32 s20, s12, s13
-; GFX6-NEXT: s_sub_i32 s14, s11, s20
-; GFX6-NEXT: s_mul_i32 s12, s8, s18
+; GFX6-NEXT: s_mul_i32 s13, s9, s16
+; GFX6-NEXT: s_add_i32 s18, s12, s13
+; GFX6-NEXT: s_sub_i32 s19, s11, s18
+; GFX6-NEXT: s_mul_i32 s12, s8, s16
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s21, s14, s9
-; GFX6-NEXT: s_sub_u32 s22, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s8
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_add_u32 s15, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s22, s15
-; GFX6-NEXT: s_cselect_b32 s15, s23, s21
+; GFX6-NEXT: s_or_b32 s20, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s19, s19, s9
+; GFX6-NEXT: s_sub_u32 s21, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s11, s11, s20
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s12, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s12, s9
+; GFX6-NEXT: s_cselect_b32 s13, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s21, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s12, s9
+; GFX6-NEXT: s_cselect_b32 s12, s19, s13
+; GFX6-NEXT: s_add_u32 s13, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s21, s16, 2
+; GFX6-NEXT: s_addc_u32 s22, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b32 s12, s21, s13
+; GFX6-NEXT: s_cselect_b32 s13, s22, s19
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_subb_u32 s11, s11, s18
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s12, -1, 0
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s12
+; GFX6-NEXT: s_cselect_b32 s8, s8, s18
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s15, s19
-; GFX6-NEXT: s_cselect_b32 s8, s14, s18
+; GFX6-NEXT: s_cselect_b32 s9, s13, s17
+; GFX6-NEXT: s_cselect_b32 s8, s12, s16
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8595,8 +8614,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s12, 0, s6
-; GFX9-NEXT: s_subb_u32 s13, 0, s7
+; GFX9-NEXT: s_sub_u32 s14, 0, s6
+; GFX9-NEXT: s_subb_u32 s15, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8605,52 +8624,56 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s15, v0
-; GFX9-NEXT: s_mul_i32 s16, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
-; GFX9-NEXT: s_mul_i32 s17, s13, s15
-; GFX9-NEXT: s_add_i32 s16, s18, s16
-; GFX9-NEXT: s_mul_i32 s19, s12, s15
-; GFX9-NEXT: s_add_i32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
-; GFX9-NEXT: s_mul_i32 s20, s15, s16
-; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
+; GFX9-NEXT: v_readfirstlane_b32 s16, v1
+; GFX9-NEXT: v_readfirstlane_b32 s12, v0
+; GFX9-NEXT: s_mul_i32 s13, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s17, s15, s12
+; GFX9-NEXT: s_add_i32 s13, s18, s13
+; GFX9-NEXT: s_mul_i32 s19, s14, s12
+; GFX9-NEXT: s_add_i32 s13, s13, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
+; GFX9-NEXT: s_mul_i32 s20, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
-; GFX9-NEXT: s_mul_i32 s19, s14, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
+; GFX9-NEXT: s_mul_i32 s19, s16, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s16, s14, s16
-; GFX9-NEXT: s_add_u32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s16, s13
+; GFX9-NEXT: s_add_u32 s13, s17, s13
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s15, s15, s16
-; GFX9-NEXT: s_addc_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_i32 s16, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_add_i32 s16, s17, s16
-; GFX9-NEXT: s_mul_i32 s13, s13, s15
-; GFX9-NEXT: s_add_i32 s16, s16, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s20, s15, s16
-; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
-; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
-; GFX9-NEXT: s_add_u32 s12, s12, s20
+; GFX9-NEXT: s_add_u32 s18, s12, s13
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_addc_u32 s16, s16, s17
+; GFX9-NEXT: s_mul_i32 s12, s14, s16
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s15, s15, s18
+; GFX9-NEXT: s_add_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s14, s18
+; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s16, s14
+; GFX9-NEXT: s_mul_i32 s20, s18, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
+; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
+; GFX9-NEXT: s_add_u32 s14, s14, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s12, s12, s18
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
-; GFX9-NEXT: s_addc_u32 s12, s19, s17
+; GFX9-NEXT: s_add_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
+; GFX9-NEXT: s_addc_u32 s14, s19, s15
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s16, s14, s16
-; GFX9-NEXT: s_add_u32 s12, s12, s16
-; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s15, s12
-; GFX9-NEXT: s_addc_u32 s14, s14, s13
+; GFX9-NEXT: s_mul_i32 s12, s16, s12
+; GFX9-NEXT: s_add_u32 s12, s14, s12
+; GFX9-NEXT: s_addc_u32 s14, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s18, s12
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_addc_u32 s14, s16, s14
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8668,35 +8691,38 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s17, s15, s14
-; GFX9-NEXT: s_addc_u32 s16, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s16
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
+; GFX9-NEXT: s_add_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s19, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s19
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s17
-; GFX9-NEXT: s_add_i32 s18, s14, s15
-; GFX9-NEXT: s_sub_i32 s19, s9, s18
-; GFX9-NEXT: s_mul_i32 s14, s6, s17
+; GFX9-NEXT: s_mul_i32 s15, s7, s18
+; GFX9-NEXT: s_add_i32 s20, s14, s15
+; GFX9-NEXT: s_sub_i32 s16, s9, s20
+; GFX9-NEXT: s_mul_i32 s14, s6, s18
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_subb_u32 s19, s19, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_subb_u32 s19, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s19, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GFX9-NEXT: s_subb_u32 s21, s16, s7
+; GFX9-NEXT: s_sub_u32 s22, s8, s6
+; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GFX9-NEXT: s_subb_u32 s16, s21, 0
+; GFX9-NEXT: s_cmp_ge_u32 s16, s7
+; GFX9-NEXT: s_cselect_b32 s17, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s22, s6
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
-; GFX9-NEXT: s_cselect_b32 s20, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s19, s7
-; GFX9-NEXT: s_cselect_b32 s19, s20, s21
-; GFX9-NEXT: s_add_u32 s20, s17, 1
-; GFX9-NEXT: s_addc_u32 s21, s16, 0
-; GFX9-NEXT: s_add_u32 s22, s17, 2
-; GFX9-NEXT: s_addc_u32 s23, s16, 0
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b32 s19, s22, s20
-; GFX9-NEXT: s_cselect_b32 s20, s23, s21
+; GFX9-NEXT: s_cmp_eq_u32 s16, s7
+; GFX9-NEXT: s_cselect_b32 s16, s21, s17
+; GFX9-NEXT: s_add_u32 s17, s18, 1
+; GFX9-NEXT: s_addc_u32 s21, s19, 0
+; GFX9-NEXT: s_add_u32 s22, s18, 2
+; GFX9-NEXT: s_addc_u32 s23, s19, 0
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cselect_b32 s16, s22, s17
+; GFX9-NEXT: s_cselect_b32 s17, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s20
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8704,12 +8730,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s20, s16
-; GFX9-NEXT: s_cselect_b32 s6, s19, s17
+; GFX9-NEXT: s_cselect_b32 s7, s17, s19
+; GFX9-NEXT: s_cselect_b32 s6, s16, s18
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s12, s6, s2
-; GFX9-NEXT: s_subb_u32 s13, s7, s3
+; GFX9-NEXT: s_sub_u32 s14, s6, s2
+; GFX9-NEXT: s_subb_u32 s15, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8718,8 +8744,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s6
-; GFX9-NEXT: s_subb_u32 s5, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8729,98 +8755,105 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: v_readfirstlane_b32 s15, v2
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
-; GFX9-NEXT: s_mul_i32 s16, s4, s15
-; GFX9-NEXT: s_mul_i32 s9, s5, s8
-; GFX9-NEXT: s_add_i32 s14, s14, s16
-; GFX9-NEXT: s_add_i32 s14, s14, s9
-; GFX9-NEXT: s_mul_i32 s17, s4, s8
-; GFX9-NEXT: s_mul_i32 s16, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v2
+; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s16, s8, s13
+; GFX9-NEXT: s_mul_i32 s5, s9, s4
+; GFX9-NEXT: s_add_i32 s12, s12, s16
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s17, s8, s4
+; GFX9-NEXT: s_mul_i32 s16, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
-; GFX9-NEXT: s_mul_i32 s17, s15, s17
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
+; GFX9-NEXT: s_mul_i32 s17, s13, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s9, s9, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
+; GFX9-NEXT: s_addc_u32 s5, s5, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s14, s15, s14
-; GFX9-NEXT: s_add_u32 s9, s9, s14
-; GFX9-NEXT: s_addc_u32 s14, 0, s16
-; GFX9-NEXT: s_add_u32 s8, s8, s9
-; GFX9-NEXT: s_addc_u32 s9, s15, s14
-; GFX9-NEXT: s_mul_i32 s14, s4, s9
-; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
-; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s5, s5, s8
-; GFX9-NEXT: s_add_i32 s14, s14, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
-; GFX9-NEXT: s_mul_i32 s16, s9, s4
-; GFX9-NEXT: s_mul_i32 s18, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
-; GFX9-NEXT: s_add_u32 s4, s4, s18
+; GFX9-NEXT: s_mul_i32 s12, s13, s12
+; GFX9-NEXT: s_add_u32 s5, s5, s12
+; GFX9-NEXT: s_addc_u32 s12, 0, s16
+; GFX9-NEXT: s_add_u32 s16, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s4, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s9, s9, s16
+; GFX9-NEXT: s_add_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s16
+; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
+; GFX9-NEXT: s_mul_i32 s13, s12, s8
+; GFX9-NEXT: s_mul_i32 s18, s16, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
+; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
+; GFX9-NEXT: s_add_u32 s8, s8, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s4, s4, s16
-; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
-; GFX9-NEXT: s_addc_u32 s4, s17, s15
+; GFX9-NEXT: s_add_u32 s8, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX9-NEXT: s_addc_u32 s8, s17, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s14, s8, s4
-; GFX9-NEXT: s_addc_u32 s15, s9, s5
+; GFX9-NEXT: s_mul_i32 s4, s12, s4
+; GFX9-NEXT: s_add_u32 s4, s8, s4
+; GFX9-NEXT: s_addc_u32 s8, 0, s5
+; GFX9-NEXT: s_add_u32 s13, s16, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s8
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
+; GFX9-NEXT: s_mul_i32 s11, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
-; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
+; GFX9-NEXT: s_mul_i32 s13, s9, s13
+; GFX9-NEXT: s_add_u32 s11, s11, s13
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s14, s9, s15
-; GFX9-NEXT: s_add_u32 s14, s10, s14
-; GFX9-NEXT: s_addc_u32 s15, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
+; GFX9-NEXT: s_mul_i32 s12, s9, s12
+; GFX9-NEXT: s_add_u32 s16, s10, s12
+; GFX9-NEXT: s_addc_u32 s17, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s17
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s17, s9, s16
-; GFX9-NEXT: s_mul_i32 s10, s6, s14
+; GFX9-NEXT: s_mul_i32 s11, s7, s16
+; GFX9-NEXT: s_add_i32 s18, s10, s11
+; GFX9-NEXT: s_sub_i32 s12, s9, s18
+; GFX9-NEXT: s_mul_i32 s10, s6, s16
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s7
-; GFX9-NEXT: s_sub_u32 s18, s8, s6
-; GFX9-NEXT: s_subb_u32 s17, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s19, s12, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GFX9-NEXT: s_subb_u32 s12, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s12, s7
+; GFX9-NEXT: s_cselect_b32 s13, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s6
-; GFX9-NEXT: s_cselect_b32 s18, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s17, s7
-; GFX9-NEXT: s_cselect_b32 s17, s18, s19
-; GFX9-NEXT: s_add_u32 s18, s14, 1
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_add_u32 s20, s14, 2
-; GFX9-NEXT: s_addc_u32 s21, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b32 s17, s20, s18
-; GFX9-NEXT: s_cselect_b32 s18, s21, s19
+; GFX9-NEXT: s_cmp_eq_u32 s12, s7
+; GFX9-NEXT: s_cselect_b32 s12, s19, s13
+; GFX9-NEXT: s_add_u32 s13, s16, 1
+; GFX9-NEXT: s_addc_u32 s19, s17, 0
+; GFX9-NEXT: s_add_u32 s20, s16, 2
+; GFX9-NEXT: s_addc_u32 s21, s17, 0
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b32 s12, s20, s13
+; GFX9-NEXT: s_cselect_b32 s13, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s16
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8828,14 +8861,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s18, s15
-; GFX9-NEXT: s_cselect_b32 s6, s17, s14
+; GFX9-NEXT: s_cselect_b32 s7, s13, s17
+; GFX9-NEXT: s_cselect_b32 s6, s12, s16
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v1, s14
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9056,9 +9089,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9089,6 +9123,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9123,43 +9158,46 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s14, s4, s5
-; GFX6-NEXT: s_sub_i32 s13, s7, s14
+; GFX6-NEXT: s_add_i32 s13, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s13
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_subb_u32 s15, s13, s9
-; GFX6-NEXT: s_sub_u32 s16, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s17, s9
-; GFX6-NEXT: s_cselect_b32 s18, s19, s18
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s15, s15, s9
-; GFX6-NEXT: s_sub_u32 s19, s16, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b32 s13, s19, s16
-; GFX6-NEXT: s_cselect_b32 s12, s12, s17
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s14, s14, s9
+; GFX6-NEXT: s_sub_u32 s15, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s14
-; GFX6-NEXT: s_cmp_ge_u32 s4, s9
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s16, s14, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s6, s8
+; GFX6-NEXT: s_cmp_ge_u32 s15, s8
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s9
+; GFX6-NEXT: s_cselect_b32 s17, s17, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s14, s14, s9
+; GFX6-NEXT: s_sub_u32 s18, s15, s8
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s4, s4, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_subb_u32 s4, s14, 0
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s14, s18, s15
+; GFX6-NEXT: s_cselect_b32 s4, s4, s16
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s5, s7, s13
+; GFX6-NEXT: s_cmp_ge_u32 s5, s9
; GFX6-NEXT: s_cselect_b32 s7, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s9
-; GFX6-NEXT: s_cselect_b32 s5, s7, s5
-; GFX6-NEXT: s_cmp_lg_u32 s5, 0
-; GFX6-NEXT: s_cselect_b32 s5, s12, s4
-; GFX6-NEXT: s_cselect_b32 s4, s13, s6
+; GFX6-NEXT: s_cmp_ge_u32 s6, s8
+; GFX6-NEXT: s_cselect_b32 s8, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, s9
+; GFX6-NEXT: s_cselect_b32 s7, s8, s7
+; GFX6-NEXT: s_cmp_lg_u32 s7, 0
+; GFX6-NEXT: s_cselect_b32 s5, s4, s5
+; GFX6-NEXT: s_cselect_b32 s4, s14, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9181,8 +9219,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s6
-; GFX9-NEXT: s_subb_u32 s5, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9192,52 +9230,56 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v1
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
-; GFX9-NEXT: s_mul_i32 s11, s5, s9
-; GFX9-NEXT: s_add_i32 s10, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s4, s9
-; GFX9-NEXT: s_add_i32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
-; GFX9-NEXT: s_mul_i32 s14, s9, s10
-; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s5, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s11, s9, s4
+; GFX9-NEXT: s_add_i32 s5, s12, s5
+; GFX9-NEXT: s_mul_i32 s13, s8, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
+; GFX9-NEXT: s_mul_i32 s14, s4, s5
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
-; GFX9-NEXT: s_mul_i32 s13, s8, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
+; GFX9-NEXT: s_mul_i32 s13, s10, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s10, s8, s10
-; GFX9-NEXT: s_add_u32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s10, s5
+; GFX9-NEXT: s_add_u32 s5, s11, s5
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s9, s9, s10
-; GFX9-NEXT: s_addc_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
-; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s10, s10, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX9-NEXT: s_mul_i32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s14, s9, s10
-; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
-; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
-; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_add_u32 s12, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s10, s10, s11
+; GFX9-NEXT: s_mul_i32 s4, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s9, s9, s12
+; GFX9-NEXT: s_add_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
+; GFX9-NEXT: s_mul_i32 s11, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s12, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
+; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
+; GFX9-NEXT: s_add_u32 s8, s8, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s4, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
-; GFX9-NEXT: s_addc_u32 s4, s13, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
+; GFX9-NEXT: s_addc_u32 s8, s13, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s10, s8, s10
-; GFX9-NEXT: s_add_u32 s4, s4, s10
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s9, s4
-; GFX9-NEXT: s_addc_u32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s10, s4
+; GFX9-NEXT: s_add_u32 s4, s8, s4
+; GFX9-NEXT: s_addc_u32 s8, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s12, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s10, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9267,9 +9309,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9278,11 +9322,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, s7
-; GFX9-NEXT: s_sub_u32 s11, s14, s6
-; GFX9-NEXT: s_subb_u32 s10, s10, 0
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s17, s14, s6
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s13, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s11, s14
+; GFX9-NEXT: s_cselect_b32 s11, s17, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9444,9 +9490,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9477,6 +9524,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9509,46 +9557,49 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s16, s13, s14
-; GFX6-NEXT: s_sub_i32 s14, s9, s16
+; GFX6-NEXT: s_add_i32 s14, s13, s14
+; GFX6-NEXT: s_sub_i32 s15, s9, s14
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s14, s3
-; GFX6-NEXT: s_sub_u32 s18, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s14, s15
-; GFX6-NEXT: s_subb_u32 s19, s17, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s3
-; GFX6-NEXT: s_cselect_b32 s20, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s2
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s19, s3
-; GFX6-NEXT: s_cselect_b32 s20, s21, s20
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s17, s17, s3
-; GFX6-NEXT: s_sub_u32 s21, s18, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s15, s21, s18
-; GFX6-NEXT: s_cselect_b32 s14, s14, s19
+; GFX6-NEXT: s_or_b32 s16, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s3
+; GFX6-NEXT: s_sub_u32 s17, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s18, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s3
+; GFX6-NEXT: s_cselect_b32 s13, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s2
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s18, s3
+; GFX6-NEXT: s_cselect_b32 s19, s19, s13
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s3
+; GFX6-NEXT: s_sub_u32 s20, s17, s2
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s9, s9, s16
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s13, s20, s17
+; GFX6-NEXT: s_cselect_b32 s12, s12, s18
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s14
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s12, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s12
+; GFX6-NEXT: s_cselect_b32 s2, s2, s14
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s14, s9
-; GFX6-NEXT: s_cselect_b32 s2, s15, s8
+; GFX6-NEXT: s_cselect_b32 s3, s12, s9
+; GFX6-NEXT: s_cselect_b32 s2, s13, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s14, s2, s6
-; GFX6-NEXT: s_subb_u32 s15, s3, s6
+; GFX6-NEXT: s_sub_u32 s12, s2, s6
+; GFX6-NEXT: s_subb_u32 s13, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9567,39 +9618,40 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s12
+; GFX6-NEXT: s_mul_i32 s1, s8, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s8, s2
+; GFX6-NEXT: s_mul_i32 s15, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s13
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s12, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s12, s4
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9613,98 +9665,102 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s13, v2
-; GFX6-NEXT: s_add_u32 s9, s13, s9
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s9, s15, s9
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s12, s8
+; GFX6-NEXT: s_addc_u32 s3, s14, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s12, s5, s2
+; GFX6-NEXT: s_add_u32 s14, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s13, s4, s8
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s15, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s13
+; GFX6-NEXT: s_mul_i32 s2, s8, s15
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s12
+; GFX6-NEXT: s_addc_u32 s2, s10, s14
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s13
+; GFX6-NEXT: s_mul_i32 s11, s9, s15
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_add_i32 s10, s12, s10
-; GFX6-NEXT: s_mul_i32 s12, s7, s11
-; GFX6-NEXT: s_add_i32 s16, s10, s12
-; GFX6-NEXT: s_sub_i32 s12, s9, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_add_i32 s10, s14, s10
+; GFX6-NEXT: s_mul_i32 s14, s7, s11
+; GFX6-NEXT: s_add_i32 s14, s10, s14
+; GFX6-NEXT: s_sub_i32 s15, s9, s14
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s13, s10, s11
-; GFX6-NEXT: s_subb_u32 s17, s12, s7
-; GFX6-NEXT: s_sub_u32 s18, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s12, s13
-; GFX6-NEXT: s_subb_u32 s19, s17, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s7
-; GFX6-NEXT: s_cselect_b32 s20, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s6
-; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s19, s7
-; GFX6-NEXT: s_cselect_b32 s20, s21, s20
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s17, s7
-; GFX6-NEXT: s_sub_u32 s21, s18, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s13, s21, s18
-; GFX6-NEXT: s_cselect_b32 s12, s12, s19
+; GFX6-NEXT: s_or_b32 s16, s10, s11
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s7
+; GFX6-NEXT: s_sub_u32 s17, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_subb_u32 s9, s9, s16
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s18, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s7
+; GFX6-NEXT: s_cselect_b32 s11, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s6
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s18, s7
+; GFX6-NEXT: s_cselect_b32 s19, s19, s11
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s15, s15, s7
+; GFX6-NEXT: s_sub_u32 s20, s17, s6
+; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s10, s10, s11
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_subb_u32 s10, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s11, s20, s17
+; GFX6-NEXT: s_cselect_b32 s10, s10, s18
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_subb_u32 s9, s9, s14
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s10, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s10
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s12, s9
-; GFX6-NEXT: s_cselect_b32 s6, s13, s8
+; GFX6-NEXT: s_cselect_b32 s7, s10, s9
+; GFX6-NEXT: s_cselect_b32 s6, s11, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v1, s13
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9724,8 +9780,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s12, 0, s2
+; GFX9-NEXT: s_subb_u32 s13, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9734,52 +9790,56 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v0
-; GFX9-NEXT: s_mul_i32 s14, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
-; GFX9-NEXT: s_mul_i32 s15, s7, s13
-; GFX9-NEXT: s_add_i32 s14, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s6, s13
-; GFX9-NEXT: s_add_i32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
-; GFX9-NEXT: s_mul_i32 s18, s13, s14
-; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v0
+; GFX9-NEXT: s_mul_i32 s7, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s15, s13, s6
+; GFX9-NEXT: s_add_i32 s7, s16, s7
+; GFX9-NEXT: s_mul_i32 s17, s12, s6
+; GFX9-NEXT: s_add_i32 s7, s7, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
+; GFX9-NEXT: s_mul_i32 s18, s6, s7
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
-; GFX9-NEXT: s_mul_i32 s17, s12, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
+; GFX9-NEXT: s_mul_i32 s17, s14, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s14, s12, s14
-; GFX9-NEXT: s_add_u32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s14, s7
+; GFX9-NEXT: s_add_u32 s7, s15, s7
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s13, s13, s14
-; GFX9-NEXT: s_addc_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
-; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s7, s7, s13
-; GFX9-NEXT: s_add_i32 s14, s14, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
-; GFX9-NEXT: s_mul_i32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s18, s13, s14
-; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
-; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s18
+; GFX9-NEXT: s_add_u32 s16, s6, s7
+; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-NEXT: s_addc_u32 s14, s14, s15
+; GFX9-NEXT: s_mul_i32 s6, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
+; GFX9-NEXT: s_add_i32 s6, s7, s6
+; GFX9-NEXT: s_mul_i32 s13, s13, s16
+; GFX9-NEXT: s_add_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s16
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s16, s6
+; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
+; GFX9-NEXT: s_add_u32 s12, s12, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s6, s6, s16
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
-; GFX9-NEXT: s_addc_u32 s6, s17, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
+; GFX9-NEXT: s_addc_u32 s12, s17, s13
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s14, s12, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s14
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s13, s6
-; GFX9-NEXT: s_addc_u32 s12, s12, s7
+; GFX9-NEXT: s_mul_i32 s6, s14, s6
+; GFX9-NEXT: s_add_u32 s6, s12, s6
+; GFX9-NEXT: s_addc_u32 s12, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s16, s6
+; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-NEXT: s_addc_u32 s12, s14, s12
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9808,9 +9868,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9819,11 +9881,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, s3
-; GFX9-NEXT: s_sub_u32 s15, s18, s2
-; GFX9-NEXT: s_subb_u32 s14, s14, 0
+; GFX9-NEXT: s_subb_u32 s17, s17, s3
+; GFX9-NEXT: s_sub_u32 s21, s18, s2
+; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GFX9-NEXT: s_subb_u32 s14, s17, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s15, s18
+; GFX9-NEXT: s_cselect_b32 s15, s21, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9847,8 +9911,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s4, 0, s2
-; GFX9-NEXT: s_subb_u32 s5, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9858,70 +9922,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX9-NEXT: s_mul_i32 s14, s4, s9
-; GFX9-NEXT: s_mul_i32 s7, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
+; GFX9-NEXT: s_mul_i32 s14, s6, s9
+; GFX9-NEXT: s_mul_i32 s5, s7, s4
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s15, s4, s6
-; GFX9-NEXT: s_mul_i32 s14, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s15, s6, s4
+; GFX9-NEXT: s_mul_i32 s14, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s7, s7, s17
+; GFX9-NEXT: s_addc_u32 s5, s5, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s7, s7, s8
+; GFX9-NEXT: s_add_u32 s5, s5, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s6, s6, s7
-; GFX9-NEXT: s_addc_u32 s7, s9, s8
-; GFX9-NEXT: s_mul_i32 s8, s4, s7
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
-; GFX9-NEXT: s_mul_i32 s14, s7, s4
-; GFX9-NEXT: s_mul_i32 s16, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
-; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_add_u32 s14, s4, s5
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s4, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
+; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s7, s7, s14
+; GFX9-NEXT: s_add_i32 s4, s4, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s14
+; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX9-NEXT: s_mul_i32 s9, s8, s6
+; GFX9-NEXT: s_mul_i32 s16, s14, s4
+; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
+; GFX9-NEXT: s_add_u32 s6, s6, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s4, s4, s14
-; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
-; GFX9-NEXT: s_addc_u32 s4, s15, s9
+; GFX9-NEXT: s_add_u32 s6, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
+; GFX9-NEXT: s_addc_u32 s6, s15, s7
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_u32 s8, s6, s4
-; GFX9-NEXT: s_addc_u32 s9, s7, s5
+; GFX9-NEXT: s_mul_i32 s4, s8, s4
+; GFX9-NEXT: s_add_u32 s4, s6, s4
+; GFX9-NEXT: s_addc_u32 s6, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s14, s4
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s8, s8, s6
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
+; GFX9-NEXT: s_mul_i32 s11, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
-; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
-; GFX9-NEXT: s_addc_u32 s8, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_add_u32 s9, s11, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s8, s9, s8
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -9932,9 +10000,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -9943,11 +10013,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, s3
-; GFX9-NEXT: s_sub_u32 s11, s16, s2
-; GFX9-NEXT: s_subb_u32 s10, s10, 0
+; GFX9-NEXT: s_subb_u32 s15, s15, s3
+; GFX9-NEXT: s_sub_u32 s19, s16, s2
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s15, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s11, s16
+; GFX9-NEXT: s_cselect_b32 s11, s19, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 01f4414..394727c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,11 +612,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,11 +653,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -691,10 +693,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -730,10 +733,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -770,10 +774,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -813,10 +818,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -853,10 +859,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -894,15 +901,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -992,11 +999,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1034,11 +1042,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1075,10 +1084,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,10 +1127,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1160,10 +1171,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1206,10 +1218,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1248,10 +1261,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1292,15 +1306,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2059,11 +2073,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2099,11 +2114,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2138,10 +2154,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2177,10 +2194,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2217,10 +2235,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2260,10 +2279,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2301,10 +2321,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2342,15 +2363,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9db6d70..258bc295 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,11 +717,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -761,11 +762,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -803,12 +805,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -850,10 +853,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -893,13 +897,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -944,10 +949,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -987,14 +993,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1022,7 +1028,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1036,15 +1041,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2358,6 +2363,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2410,6 +2416,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2455,12 +2462,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2507,12 +2515,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2560,13 +2569,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2616,13 +2626,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2666,16 +2677,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2720,17 +2731,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4479,11 +4490,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4538,11 +4550,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4595,12 +4608,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4656,10 +4670,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4713,13 +4728,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4783,10 +4799,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4844,14 +4861,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4879,7 +4896,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4893,15 +4909,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6657,6 +6673,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6729,6 +6746,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6794,12 +6812,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6864,12 +6883,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6935,13 +6955,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7015,13 +7036,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7087,16 +7109,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7141,17 +7163,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6167a84..23c5f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,11 +499,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -539,11 +540,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -578,10 +580,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -618,10 +621,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -659,10 +663,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -702,10 +707,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1082,10 +1088,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1110,10 +1117,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1139,8 +1147,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1167,8 +1176,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1196,8 +1206,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1227,8 +1239,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2008,6 +2022,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2056,6 +2071,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2096,12 +2112,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2143,12 +2160,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2191,13 +2209,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2242,13 +2261,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2861,6 +2881,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2893,6 +2914,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2924,6 +2946,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2956,6 +2979,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2989,6 +3013,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3022,8 +3048,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3879,11 +3906,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3919,11 +3947,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3958,10 +3987,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3998,10 +4028,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4039,10 +4070,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4082,10 +4114,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4462,10 +4495,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4490,10 +4524,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4519,8 +4554,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4547,8 +4583,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4576,8 +4613,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4607,8 +4646,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5411,6 +5452,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5459,6 +5501,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5499,12 +5542,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5546,12 +5590,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5594,13 +5639,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5645,13 +5691,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6266,11 +6313,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6306,11 +6354,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6345,10 +6394,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6385,10 +6435,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6426,10 +6477,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6469,10 +6521,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6873,11 +6926,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6919,11 +6973,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6960,14 +7015,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7009,11 +7065,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7052,15 +7109,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7105,11 +7163,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7613,11 +7672,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7653,11 +7713,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7692,10 +7753,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7732,10 +7794,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7773,10 +7836,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7816,10 +7880,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8219,11 +8284,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8265,11 +8331,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8306,14 +8373,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8355,11 +8423,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8398,15 +8467,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8451,11 +8521,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8959,11 +9030,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8999,11 +9071,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9038,10 +9111,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9078,10 +9152,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9119,10 +9194,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9162,10 +9238,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9565,11 +9642,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9611,11 +9689,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9652,14 +9731,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9701,11 +9781,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9744,15 +9825,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
-; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9797,11 +9879,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10305,11 +10388,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10345,11 +10429,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10384,10 +10469,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10424,10 +10510,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10465,10 +10552,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10508,10 +10596,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11166,6 +11255,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11221,6 +11311,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11272,6 +11363,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11323,6 +11415,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11375,8 +11468,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11431,8 +11525,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12119,11 +12214,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12159,11 +12255,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12198,10 +12295,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12238,10 +12336,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12279,10 +12378,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12322,10 +12422,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12980,6 +13081,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13035,6 +13137,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13086,6 +13189,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,6 +13241,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,8 +13294,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13245,8 +13351,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13933,11 +14040,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13973,11 +14081,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14012,10 +14121,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14052,10 +14162,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14093,10 +14204,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14136,10 +14248,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14788,6 +14901,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14842,6 +14956,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14892,6 +15007,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14942,6 +15058,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14995,6 +15112,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15050,6 +15169,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15732,11 +15853,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15772,11 +15894,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15811,10 +15934,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15851,10 +15975,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15892,10 +16017,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -15935,10 +16061,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16588,6 +16715,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16642,6 +16770,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16692,6 +16821,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16742,6 +16872,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16795,6 +16926,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16850,6 +16983,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 9afc0c6..e4def28 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,11 +611,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -651,11 +652,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -690,10 +692,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -729,10 +732,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -769,10 +773,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -812,10 +817,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -852,10 +858,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -893,15 +900,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1658,11 +1665,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1698,11 +1706,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1737,10 +1746,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1776,10 +1786,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1816,10 +1827,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1859,10 +1871,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1900,10 +1913,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1941,15 +1955,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 10fd34f..39a3c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,11 +628,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -669,11 +670,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -709,10 +711,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -749,10 +752,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -790,10 +794,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -834,10 +839,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -874,10 +880,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -916,15 +923,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1826,11 +1833,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s6, v0, s3
-; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: v_readlane_b32 s8, v0, s3
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_writelane_b32 v1, s2, m0
+; GFX8-NEXT: s_add_i32 s2, s2, s8
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1867,11 +1875,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s6, v0, s3
-; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: v_readlane_b32 s8, v0, s3
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_writelane_b32 v1, s2, m0
+; GFX9-NEXT: s_add_i32 s2, s2, s8
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1907,10 +1916,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_add_i32 s2, s2, s8
+; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1947,10 +1957,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
+; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
+; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1988,10 +1999,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_add_i32 s2, s2, s8
+; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2032,10 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,10 +2086,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
+; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2115,15 +2129,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
+; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de17..4a6fa4f 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,6 +704,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -724,14 +725,16 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -743,10 +746,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -759,8 +764,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_addc_u32 s1, s13, s15
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -774,8 +781,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -789,8 +798,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -803,8 +814,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -818,8 +831,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1676,6 +1691,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
+; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1696,14 +1712,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1715,10 +1733,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1731,8 +1751,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1010-NEXT: s_subb_u32 s1, s13, s15
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1746,8 +1768,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1761,8 +1785,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1775,8 +1801,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1790,8 +1818,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2188,46 +2218,49 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s10, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: s_add_u32 s12, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s11, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s11
+; VI-NEXT: s_addc_u32 s13, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s13
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s10
-; VI-NEXT: s_add_i32 s12, s8, s9
-; VI-NEXT: s_sub_i32 s13, s3, s12
+; VI-NEXT: s_mul_i32 s9, s5, s12
+; VI-NEXT: s_add_i32 s14, s8, s9
+; VI-NEXT: s_sub_i32 s10, s3, s14
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s14, s2, s8
+; VI-NEXT: s_sub_u32 s15, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_subb_u32 s13, s13, s5
-; VI-NEXT: s_sub_u32 s15, s14, s4
-; VI-NEXT: s_subb_u32 s13, s13, 0
-; VI-NEXT: s_cmp_ge_u32 s13, s5
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
+; VI-NEXT: s_subb_u32 s16, s10, s5
+; VI-NEXT: s_sub_u32 s17, s15, s4
+; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
+; VI-NEXT: s_subb_u32 s10, s16, 0
+; VI-NEXT: s_cmp_ge_u32 s10, s5
+; VI-NEXT: s_cselect_b32 s11, -1, 0
+; VI-NEXT: s_cmp_ge_u32 s17, s4
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
-; VI-NEXT: s_cselect_b32 s15, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s13, s5
-; VI-NEXT: s_cselect_b32 s13, s15, s16
-; VI-NEXT: s_add_u32 s15, s10, 1
-; VI-NEXT: s_addc_u32 s16, s11, 0
-; VI-NEXT: s_add_u32 s17, s10, 2
-; VI-NEXT: s_addc_u32 s18, s11, 0
-; VI-NEXT: s_cmp_lg_u32 s13, 0
-; VI-NEXT: s_cselect_b32 s13, s17, s15
-; VI-NEXT: s_cselect_b32 s15, s18, s16
+; VI-NEXT: s_cmp_eq_u32 s10, s5
+; VI-NEXT: s_cselect_b32 s10, s16, s11
+; VI-NEXT: s_add_u32 s11, s12, 1
+; VI-NEXT: s_addc_u32 s16, s13, 0
+; VI-NEXT: s_add_u32 s17, s12, 2
+; VI-NEXT: s_addc_u32 s18, s13, 0
+; VI-NEXT: s_cmp_lg_u32 s10, 0
+; VI-NEXT: s_cselect_b32 s10, s17, s11
+; VI-NEXT: s_cselect_b32 s11, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s12
+; VI-NEXT: s_subb_u32 s3, s3, s14
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s14, s4
+; VI-NEXT: s_cmp_ge_u32 s15, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s15, s11
-; VI-NEXT: s_cselect_b32 s8, s13, s10
+; VI-NEXT: s_cselect_b32 s9, s11, s13
+; VI-NEXT: s_cselect_b32 s8, s10, s12
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2278,8 +2311,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s10, 0, s6
+; GFX9-NEXT: s_subb_u32 s11, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2288,102 +2321,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s10, v1
-; GFX9-NEXT: v_readfirstlane_b32 s11, v0
-; GFX9-NEXT: s_mul_i32 s12, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
-; GFX9-NEXT: s_mul_i32 s13, s9, s11
-; GFX9-NEXT: s_add_i32 s12, s14, s12
-; GFX9-NEXT: s_add_i32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s15, s8, s11
-; GFX9-NEXT: s_mul_i32 s14, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s8, v0
+; GFX9-NEXT: s_mul_i32 s9, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s13, s11, s8
+; GFX9-NEXT: s_add_i32 s9, s14, s9
+; GFX9-NEXT: s_add_i32 s9, s9, s13
+; GFX9-NEXT: s_mul_i32 s15, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s8, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
-; GFX9-NEXT: s_mul_i32 s15, s10, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_mul_i32 s15, s12, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s12, s9
+; GFX9-NEXT: s_add_u32 s9, s13, s9
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s11, s11, s12
-; GFX9-NEXT: s_addc_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_i32 s12, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s9, s9, s11
-; GFX9-NEXT: s_add_i32 s12, s12, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s16, s11, s12
-; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
-; GFX9-NEXT: s_add_u32 s8, s8, s16
+; GFX9-NEXT: s_add_u32 s14, s8, s9
+; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_addc_u32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s8, s10, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s11, s11, s14
+; GFX9-NEXT: s_add_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s10, s14
+; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s12, s10
+; GFX9-NEXT: s_mul_i32 s16, s14, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
+; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
+; GFX9-NEXT: s_add_u32 s10, s10, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s8, s8, s14
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
-; GFX9-NEXT: s_addc_u32 s8, s15, s13
+; GFX9-NEXT: s_add_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
+; GFX9-NEXT: s_addc_u32 s10, s15, s11
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s12, s10, s12
-; GFX9-NEXT: s_add_u32 s8, s8, s12
+; GFX9-NEXT: s_mul_i32 s8, s12, s8
+; GFX9-NEXT: s_add_u32 s8, s10, s8
+; GFX9-NEXT: s_addc_u32 s10, 0, s9
+; GFX9-NEXT: s_add_u32 s11, s14, s8
+; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_addc_u32 s8, s12, s10
+; GFX9-NEXT: s_mul_i32 s10, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
+; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
+; GFX9-NEXT: s_add_u32 s10, s12, s10
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s9
-; GFX9-NEXT: s_mul_i32 s11, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT: s_add_u32 s11, s12, s11
-; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
-; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s8, s11, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
-; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
+; GFX9-NEXT: s_mul_i32 s11, s3, s11
+; GFX9-NEXT: s_add_u32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
+; GFX9-NEXT: s_addc_u32 s9, s9, s13
; GFX9-NEXT: s_addc_u32 s10, s12, 0
-; GFX9-NEXT: s_mul_i32 s9, s3, s9
-; GFX9-NEXT: s_add_u32 s11, s8, s9
-; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s10
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
+; GFX9-NEXT: s_mul_i32 s8, s3, s8
+; GFX9-NEXT: s_add_u32 s12, s9, s8
+; GFX9-NEXT: s_addc_u32 s13, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s11
-; GFX9-NEXT: s_add_i32 s12, s8, s9
-; GFX9-NEXT: s_sub_i32 s13, s3, s12
-; GFX9-NEXT: s_mul_i32 s8, s6, s11
-; GFX9-NEXT: s_sub_u32 s14, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s12
+; GFX9-NEXT: s_add_i32 s14, s8, s9
+; GFX9-NEXT: s_sub_i32 s10, s3, s14
+; GFX9-NEXT: s_mul_i32 s8, s6, s12
+; GFX9-NEXT: s_sub_u32 s15, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s15, s14, s6
-; GFX9-NEXT: s_subb_u32 s13, s13, 0
-; GFX9-NEXT: s_cmp_ge_u32 s13, s7
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX9-NEXT: s_subb_u32 s16, s10, s7
+; GFX9-NEXT: s_sub_u32 s17, s15, s6
+; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GFX9-NEXT: s_subb_u32 s10, s16, 0
+; GFX9-NEXT: s_cmp_ge_u32 s10, s7
+; GFX9-NEXT: s_cselect_b32 s11, -1, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s6
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
-; GFX9-NEXT: s_cselect_b32 s15, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s13, s7
-; GFX9-NEXT: s_cselect_b32 s13, s15, s16
-; GFX9-NEXT: s_add_u32 s15, s11, 1
-; GFX9-NEXT: s_addc_u32 s16, s10, 0
-; GFX9-NEXT: s_add_u32 s17, s11, 2
-; GFX9-NEXT: s_addc_u32 s18, s10, 0
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b32 s13, s17, s15
-; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cmp_eq_u32 s10, s7
+; GFX9-NEXT: s_cselect_b32 s10, s16, s11
+; GFX9-NEXT: s_add_u32 s11, s12, 1
+; GFX9-NEXT: s_addc_u32 s16, s13, 0
+; GFX9-NEXT: s_add_u32 s17, s12, 2
+; GFX9-NEXT: s_addc_u32 s18, s13, 0
+; GFX9-NEXT: s_cmp_lg_u32 s10, 0
+; GFX9-NEXT: s_cselect_b32 s10, s17, s11
+; GFX9-NEXT: s_cselect_b32 s11, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s12
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s14, s6
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s15, s10
-; GFX9-NEXT: s_cselect_b32 s8, s13, s11
+; GFX9-NEXT: s_cselect_b32 s9, s11, s13
+; GFX9-NEXT: s_cselect_b32 s8, s10, s12
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2463,40 +2503,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
+; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1010-NEXT: s_mul_i32 s12, s9, s8
-; GFX1010-NEXT: s_mul_i32 s9, s9, s5
; GFX1010-NEXT: s_mul_i32 s10, s10, s8
-; GFX1010-NEXT: s_add_i32 s9, s11, s9
-; GFX1010-NEXT: s_mul_i32 s11, s5, s12
+; GFX1010-NEXT: s_mul_i32 s9, s9, s5
+; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1010-NEXT: s_add_i32 s9, s13, s9
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1010-NEXT: s_mul_i32 s10, s5, s11
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s10, s10, s15
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1010-NEXT: s_add_u32 s12, s12, s15
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s10, s11
+; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s12, s10
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s12, 0
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
+; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1010-NEXT: s_mul_i32 s8, s3, s8
-; GFX1010-NEXT: s_add_u32 s9, s9, s12
-; GFX1010-NEXT: s_addc_u32 s11, 0, s11
+; GFX1010-NEXT: s_mul_i32 s12, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
+; GFX1010-NEXT: s_add_u32 s11, s11, s12
+; GFX1010-NEXT: s_addc_u32 s10, 0, s10
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s9, s8
+; GFX1010-NEXT: s_add_u32 s8, s11, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s11, s10
+; GFX1010-NEXT: s_addc_u32 s8, s10, s9
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2509,8 +2553,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
+; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2616,40 +2663,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
-; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
-; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
-; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
+; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
+; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
-; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
-; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
+; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
+; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
+; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
+; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2662,8 +2713,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
+; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2736,8 +2790,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2746,102 +2800,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
-; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
-; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
-; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
-; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
+; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
+; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
+; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
+; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
+; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
+; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
-; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
-; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
-; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
-; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
-; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
+; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
+; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
+; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
+; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
+; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
-; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
-; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
-; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
-; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
-; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
-; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
+; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
+; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
+; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
+; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
+; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
+; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
+; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
+; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
-; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
-; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
+; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
+; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
-; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
+; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
-; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
-; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
+; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
+; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
-; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
-; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
+; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
+; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
+; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
+; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2927,40 +2988,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
+; GFX11-NEXT: s_cselect_b32 s11, -1, 0
+; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX11-NEXT: s_mul_i32 s12, s9, s8
-; GFX11-NEXT: s_mul_i32 s9, s9, s7
; GFX11-NEXT: s_mul_i32 s10, s10, s8
-; GFX11-NEXT: s_add_i32 s9, s11, s9
-; GFX11-NEXT: s_mul_i32 s11, s7, s12
+; GFX11-NEXT: s_mul_i32 s9, s9, s7
+; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX11-NEXT: s_add_i32 s9, s13, s9
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX11-NEXT: s_mul_i32 s10, s7, s11
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s10, s10, s15
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
+; GFX11-NEXT: s_add_u32 s12, s12, s15
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s10, s11
+; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s12, s10
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s12, 0
+; GFX11-NEXT: s_addc_u32 s11, s11, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
+; GFX11-NEXT: s_cselect_b32 s9, -1, 0
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX11-NEXT: s_mul_i32 s8, s3, s8
-; GFX11-NEXT: s_add_u32 s9, s9, s12
-; GFX11-NEXT: s_addc_u32 s11, 0, s11
+; GFX11-NEXT: s_mul_i32 s12, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
+; GFX11-NEXT: s_add_u32 s11, s11, s12
+; GFX11-NEXT: s_addc_u32 s10, 0, s10
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s9, s8
+; GFX11-NEXT: s_add_u32 s8, s11, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s11, s10
+; GFX11-NEXT: s_addc_u32 s8, s10, s9
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -2970,14 +3035,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
+; GFX11-NEXT: s_cselect_b32 s14, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3050,8 +3118,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
+; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3086,9 +3155,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3103,17 +3175,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
+; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
-; GFX1250-NEXT: s_mul_i32 s12, s3, s8
+; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
+; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
+; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
+; GFX1250-NEXT: s_mul_i32 s11, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3128,8 +3202,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
+; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 07e6a76..4b151b9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,8 +714,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_flbit_i32_b32 s3, s3
; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s3, s3
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index fca57be..cefcbdd 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,6 +1491,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
+; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1520,6 +1521,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e..d8a5e7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,6 +14,7 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
+; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -30,6 +31,8 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -46,6 +49,8 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
+; GFX10-NEXT: s_cselect_b32 s6, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -62,13 +67,16 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -96,6 +104,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
+; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -116,10 +125,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s1, s0, s0
-; GFX9-NEXT: s_addc_u32 s0, s0, 0
+; GFX9-NEXT: s_add_u32 s0, s2, s2
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s2, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -142,6 +153,8 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -165,9 +178,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 9a17538..62847b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,6 +1117,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1168,6 +1169,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1215,6 +1217,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1261,9 +1264,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1315,9 +1320,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4016,6 +4023,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4058,6 +4066,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4111,9 +4120,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s5, s5, 0xffe
; VI-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-NEXT: s_and_b32 s5, s5, 0xffe
; VI-NEXT: s_or_b32 s2, s6, s2
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4153,6 +4163,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4198,9 +4209,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-NEXT: s_or_b32 s2, s6, s2
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4242,6 +4254,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4288,10 +4301,11 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-NEXT: s_or_b32 s2, s6, s2
+; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-NEXT: s_lshr_b32 s6, s3, 8
+; GFX11-NEXT: s_or_b32 s2, s5, s2
+; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4334,12 +4348,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index c28b25c7..b0dd187 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,8 +599,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -709,8 +711,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -820,8 +824,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -931,8 +937,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1110,15 +1118,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1165,15 +1175,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1354,15 +1366,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1409,15 +1423,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2138,8 +2154,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2175,10 +2193,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
+; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2335,8 +2355,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2370,12 +2392,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
+; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2531,8 +2555,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2566,12 +2592,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2724,8 +2752,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2759,12 +2789,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3041,15 +3073,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3081,17 +3115,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3140,15 +3176,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3180,17 +3218,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3471,15 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3511,17 +3553,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3570,15 +3614,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3610,17 +3656,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 0deef8b..5d31177 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -236,6 +237,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -288,8 +290,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -331,10 +335,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
-; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
+; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -382,14 +387,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -431,10 +438,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
-; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
-; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
+; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -490,15 +498,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 31f277f..37756d1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,6 +472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -535,10 +536,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -604,6 +606,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -657,11 +660,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -706,8 +710,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1685,6 +1690,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1748,10 +1754,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1817,6 +1824,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1870,11 +1878,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1919,8 +1928,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2958,6 +2968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3021,10 +3032,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3090,6 +3102,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3143,11 +3156,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3192,8 +3206,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3727,6 +3742,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3790,10 +3806,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3859,6 +3876,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3912,11 +3930,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3961,8 +3980,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4999,6 +5019,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5062,10 +5083,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5131,6 +5153,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5184,11 +5207,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5246,8 +5270,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6259,6 +6284,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6328,6 +6354,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6397,6 +6424,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6457,6 +6485,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6520,6 +6550,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7686,6 +7717,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7755,6 +7787,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7824,6 +7857,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7884,6 +7918,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7947,6 +7983,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9113,6 +9150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9182,6 +9220,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9251,6 +9290,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9311,6 +9351,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9374,6 +9416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10022,6 +10065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10091,6 +10135,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10160,6 +10205,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10220,6 +10266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10283,6 +10331,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11449,6 +11498,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11518,6 +11568,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11587,6 +11638,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11647,6 +11699,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11710,6 +11764,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 4581efc..6351bb3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd570d9..a9ac008 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s3, v0, s2
-; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 1f2d70c..6311143 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,6 +532,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -595,10 +596,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -664,6 +666,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -717,11 +720,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -779,8 +783,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1857,6 +1862,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1920,10 +1926,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1989,6 +1996,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2042,11 +2050,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2104,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3182,6 +3192,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3245,10 +3256,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3314,6 +3326,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3367,11 +3380,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3429,8 +3443,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4003,6 +4018,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4066,10 +4082,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4135,6 +4152,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4188,11 +4206,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4250,8 +4269,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5327,6 +5347,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5390,10 +5411,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5459,6 +5481,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5512,11 +5535,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5574,8 +5598,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6587,6 +6612,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6656,6 +6682,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6725,6 +6752,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6785,6 +6813,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6848,6 +6878,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8013,6 +8044,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8082,6 +8114,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8151,6 +8184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8211,6 +8245,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8274,6 +8310,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9440,6 +9477,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9509,6 +9547,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9578,6 +9617,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9638,6 +9678,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9701,6 +9743,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10349,6 +10392,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10418,6 +10462,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10487,6 +10532,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10547,6 +10593,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10610,6 +10658,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11775,6 +11824,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11844,6 +11894,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11913,6 +11964,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11973,6 +12025,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12036,6 +12090,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index c3f3917..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,17 +136,19 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_cselect_b32 s13, -1, 0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-NEXT: s_and_b32 s13, s8, s13
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
+; GFX11-NEXT: v_readfirstlane_b32 s13, v0
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT: s_and_b32 s1, s8, s1
+; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s1, s1, 1
+; GFX11-NEXT: s_cselect_b32 s1, s19, s13
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_and_b32 s1, s1, 1
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 6dc9199..8748aff 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,10 +8265,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
+; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
-; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8349,13 +8351,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: v_readfirstlane_b32 s6, v1
-; GFX942-NEXT: s_mov_b32 m0, s3
-; GFX942-NEXT: v_readlane_b32 s8, v2, s3
-; GFX942-NEXT: v_writelane_b32 v0, s6, m0
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX942-NEXT: v_readfirstlane_b32 s8, v1
+; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: s_mov_b32 m0, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX942-NEXT: v_writelane_b32 v0, s8, m0
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8437,14 +8440,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
+; GFX11-NEXT: s_lshl_b32 s7, 1, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_lshl_b32 s1, 1, s1
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8524,10 +8528,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
+; GFX10-NEXT: s_lshl_b32 s7, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_lshl_b32 s1, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s1
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8604,13 +8609,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
-; GFX90A-NEXT: s_mov_b32 m0, s3
-; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
-; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: s_mov_b32 m0, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8686,13 +8692,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: v_readfirstlane_b32 s6, v1
-; GFX908-NEXT: s_mov_b32 m0, s3
-; GFX908-NEXT: v_readlane_b32 s8, v2, s3
-; GFX908-NEXT: v_writelane_b32 v0, s6, m0
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT: v_readfirstlane_b32 s8, v1
+; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: s_mov_b32 m0, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX908-NEXT: v_writelane_b32 v0, s8, m0
+; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8769,13 +8776,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v2, s3
-; GFX8-NEXT: v_writelane_b32 v0, s6, m0
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: s_mov_b32 m0, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX8-NEXT: v_writelane_b32 v0, s8, m0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9122,10 +9130,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
+; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
-; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9202,13 +9212,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: v_readfirstlane_b32 s6, v1
-; GFX942-NEXT: s_mov_b32 m0, s3
-; GFX942-NEXT: v_readlane_b32 s8, v2, s3
-; GFX942-NEXT: v_writelane_b32 v0, s6, m0
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX942-NEXT: v_readfirstlane_b32 s8, v1
+; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: s_mov_b32 m0, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX942-NEXT: v_writelane_b32 v0, s8, m0
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9285,14 +9296,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
+; GFX11-NEXT: s_lshl_b32 s7, 1, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_lshl_b32 s1, 1, s1
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9365,10 +9377,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
+; GFX10-NEXT: s_lshl_b32 s7, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_lshl_b32 s1, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s1
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9440,13 +9453,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
-; GFX90A-NEXT: s_mov_b32 m0, s3
-; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
-; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: s_mov_b32 m0, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9519,13 +9533,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: v_readfirstlane_b32 s6, v1
-; GFX908-NEXT: s_mov_b32 m0, s3
-; GFX908-NEXT: v_readlane_b32 s8, v2, s3
-; GFX908-NEXT: v_writelane_b32 v0, s6, m0
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT: v_readfirstlane_b32 s8, v1
+; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: s_mov_b32 m0, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX908-NEXT: v_writelane_b32 v0, s8, m0
+; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9599,13 +9614,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v2, s3
-; GFX8-NEXT: v_writelane_b32 v0, s6, m0
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s8, v1
+; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: s_mov_b32 m0, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
+; GFX8-NEXT: v_writelane_b32 v0, s8, m0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fba42c4..c1cf06e 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,8 +388,9 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit killed $scc
+ ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -416,80 +417,6 @@ body: |
    S_ENDPGM 0

...
----
-name: xor_1_cmp_lg_0_killed_scc
-body: |
- ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
- ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
- ; GCN-NEXT: S_BRANCH %bb.1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: S_ENDPGM 0
- bb.0:
- successors: %bb.1(0x40000000), %bb.2(0x40000000)
- liveins: $sgpr0, $vgpr0_vgpr1
-
- %0:sreg_32 = COPY $sgpr0
- %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
- S_NOP 0, implicit killed $scc
- S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
- S_CBRANCH_SCC0 %bb.2, implicit $scc
- S_BRANCH %bb.1
-
- bb.1:
- successors: %bb.2(0x80000000)
-
- bb.2:
- S_ENDPGM 0
-
-...
----
-name: absdiff_1_cmp_lg_0_killed_scc
-body: |
- ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
- ; GCN: bb.0:
- ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit $scc
- ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
- ; GCN-NEXT: S_BRANCH %bb.1
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.1:
- ; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: bb.2:
- ; GCN-NEXT: S_ENDPGM 0
- bb.0:
- successors: %bb.1(0x40000000), %bb.2(0x40000000)
- liveins: $sgpr0, $vgpr0_vgpr1
-
- %0:sreg_32 = COPY $sgpr0
- %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
- S_NOP 0, implicit killed $scc
- S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
- S_CBRANCH_SCC0 %bb.2, implicit $scc
- S_BRANCH %bb.1
-
- bb.1:
- successors: %bb.2(0x80000000)
-
- bb.2:
- S_ENDPGM 0
-
-...
---
name: and_1_cmp_eq_1_clobbered_scc
body: |
@@ -2143,7 +2070,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 19cc7f7..f53aaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -10,6 +10,7 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -24,6 +25,7 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -38,6 +40,7 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -52,6 +55,7 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -66,6 +70,7 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -80,6 +85,7 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -94,6 +100,7 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -114,6 +121,7 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -128,6 +136,7 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -142,6 +151,7 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -156,6 +166,7 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -170,6 +181,7 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -184,6 +196,7 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -198,6 +211,7 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -217,6 +231,7 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -236,6 +251,7 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -255,6 +271,7 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -274,6 +291,7 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -293,6 +311,7 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -312,6 +331,7 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -327,6 +347,7 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -342,6 +363,7 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -357,6 +379,7 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -372,6 +395,7 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -409,6 +433,7 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -488,6 +513,7 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -526,6 +552,7 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -544,6 +571,7 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -562,6 +590,7 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -580,6 +609,7 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index 7552f6b..a828ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,6 +12,8 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -30,6 +32,8 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94..5f6d622 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,9 +56,10 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -89,6 +90,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -114,50 +116,52 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s16, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: s_add_u32 s14, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s4
+; GCN-NEXT: s_addc_u32 s15, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s17
+; GCN-NEXT: s_mul_i32 s4, s10, s15
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s16
-; GCN-NEXT: s_add_i32 s18, s4, s5
-; GCN-NEXT: s_sub_i32 s14, s7, s18
-; GCN-NEXT: s_mul_i32 s4, s10, s16
+; GCN-NEXT: s_mul_i32 s5, s11, s14
+; GCN-NEXT: s_add_i32 s16, s4, s5
+; GCN-NEXT: s_sub_i32 s17, s7, s16
+; GCN-NEXT: s_mul_i32 s4, s10, s14
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s15, s4, s5
-; GCN-NEXT: s_subb_u32 s19, s14, s11
-; GCN-NEXT: s_sub_u32 s20, s6, s10
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_or_b32 s14, s14, s15
-; GCN-NEXT: s_subb_u32 s14, s19, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s20, s10
-; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s14, s19, s15
-; GCN-NEXT: s_add_u32 s15, s16, 1
-; GCN-NEXT: s_addc_u32 s19, s17, 0
-; GCN-NEXT: s_add_u32 s20, s16, 2
-; GCN-NEXT: s_addc_u32 s21, s17, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s14, s20, s15
-; GCN-NEXT: s_cselect_b32 s15, s21, s19
+; GCN-NEXT: s_or_b32 s18, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_subb_u32 s17, s17, s11
+; GCN-NEXT: s_sub_u32 s19, s6, s10
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s18
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s17, 0
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s6, s10
-; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s19, s10
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s17, s5
+; GCN-NEXT: s_add_u32 s5, s14, 1
+; GCN-NEXT: s_addc_u32 s17, s15, 0
+; GCN-NEXT: s_add_u32 s19, s14, 2
+; GCN-NEXT: s_addc_u32 s20, s15, 0
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s15, s17
-; GCN-NEXT: s_cselect_b32 s4, s14, s16
+; GCN-NEXT: s_cselect_b32 s4, s19, s5
+; GCN-NEXT: s_cselect_b32 s5, s20, s17
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s16
+; GCN-NEXT: s_cmp_ge_u32 s7, s11
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s6, s10
+; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s11
+; GCN-NEXT: s_cselect_b32 s6, s6, s16
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_cselect_b32 s5, s5, s15
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -204,6 +208,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -237,6 +242,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1189,9 +1195,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1222,6 +1229,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1230,46 +1238,48 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s12, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: s_addc_u32 s10, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s12
+; GCN-NEXT: s_mul_i32 s8, s7, s10
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s13, s9, s8
-; GCN-NEXT: s_sub_i32 s10, 0, s13
-; GCN-NEXT: s_mul_i32 s8, s6, s12
-; GCN-NEXT: s_sub_u32 s14, 24, s8
+; GCN-NEXT: s_add_i32 s11, s9, s8
+; GCN-NEXT: s_sub_i32 s12, 0, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s10
+; GCN-NEXT: s_sub_u32 s13, 24, s8
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s12, s12, s7
+; GCN-NEXT: s_sub_u32 s15, s13, s6
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s11, s8, s9
-; GCN-NEXT: s_subb_u32 s15, s10, s7
-; GCN-NEXT: s_sub_u32 s16, s14, s6
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s6
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s10, s15, s11
-; GCN-NEXT: s_add_u32 s11, s12, 1
-; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_add_u32 s16, s12, 2
-; GCN-NEXT: s_addc_u32 s17, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_cselect_b32 s10, s16, s11
-; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, 0, s13
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_subb_u32 s8, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s6
-; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s6
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s9
+; GCN-NEXT: s_cselect_b32 s8, s12, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s12, 0, 0
+; GCN-NEXT: s_add_u32 s15, s10, 2
+; GCN-NEXT: s_addc_u32 s16, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s15, s9
+; GCN-NEXT: s_cselect_b32 s9, s16, s12
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s11, 0, s11
+; GCN-NEXT: s_cmp_ge_u32 s11, s7
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s11, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s12
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s11, 0
-; GCN-NEXT: s_cselect_b32 s6, s10, s12
+; GCN-NEXT: s_cselect_b32 s7, s9, 0
+; GCN-NEXT: s_cselect_b32 s6, s8, s10
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1305,6 +1315,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1337,6 +1348,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index e12e31b..bbd1793 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s10, 0, s9
+; GCN-NEXT: s_subb_u32 s12, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
-; GCN-NEXT: s_mul_i32 s14, s10, s12
-; GCN-NEXT: s_add_i32 s13, s15, s13
-; GCN-NEXT: s_add_i32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s12
-; GCN-NEXT: s_mul_i32 s15, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_mul_i32 s11, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
+; GCN-NEXT: s_mul_i32 s14, s12, s10
+; GCN-NEXT: s_add_i32 s11, s15, s11
+; GCN-NEXT: s_add_i32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s10
+; GCN-NEXT: s_mul_i32 s15, s10, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
-; GCN-NEXT: s_mul_i32 s16, s11, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
+; GCN-NEXT: s_mul_i32 s16, s13, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s11, s13, s11
+; GCN-NEXT: s_add_u32 s11, s14, s11
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s12, s12, s13
-; GCN-NEXT: s_addc_u32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s13, s3, s11
-; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
-; GCN-NEXT: s_add_i32 s13, s14, s13
-; GCN-NEXT: s_mul_i32 s10, s10, s12
-; GCN-NEXT: s_add_i32 s13, s13, s10
-; GCN-NEXT: s_mul_i32 s3, s3, s12
-; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
-; GCN-NEXT: s_mul_i32 s15, s11, s3
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: s_add_u32 s15, s10, s11
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s10, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
+; GCN-NEXT: s_add_i32 s10, s11, s10
+; GCN-NEXT: s_mul_i32 s12, s12, s15
+; GCN-NEXT: s_add_i32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
+; GCN-NEXT: s_mul_i32 s14, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s15, s10
+; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
-; GCN-NEXT: s_addc_u32 s3, s16, s14
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_add_u32 s3, s3, s13
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s3, s12, s3
-; GCN-NEXT: s_addc_u32 s14, s11, s10
+; GCN-NEXT: s_add_u32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
+; GCN-NEXT: s_addc_u32 s3, s16, s12
+; GCN-NEXT: s_addc_u32 s11, s11, 0
+; GCN-NEXT: s_mul_i32 s10, s13, s10
+; GCN-NEXT: s_add_u32 s3, s3, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s11
+; GCN-NEXT: s_add_u32 s3, s15, s3
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
+; GCN-NEXT: s_addc_u32 s14, s13, s12
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s16, s18, s8
+; GCN-NEXT: s_sub_u32 s21, s18, s8
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s16, s18
+; GCN-NEXT: s_cselect_b32 s16, s21, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s14, 0, s7
+; GCN-NEXT: s_subb_u32 s16, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
-; GCN-NEXT: s_mul_i32 s18, s14, s16
-; GCN-NEXT: s_add_i32 s17, s19, s17
-; GCN-NEXT: s_add_i32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s16
-; GCN-NEXT: s_mul_i32 s19, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
+; GCN-NEXT: s_mul_i32 s18, s16, s14
+; GCN-NEXT: s_add_i32 s15, s19, s15
+; GCN-NEXT: s_add_i32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s14
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
-; GCN-NEXT: s_mul_i32 s20, s15, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
+; GCN-NEXT: s_mul_i32 s20, s17, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s15, s17, s15
+; GCN-NEXT: s_add_u32 s15, s18, s15
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s16, s16, s17
-; GCN-NEXT: s_addc_u32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s17, s9, s15
-; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
-; GCN-NEXT: s_add_i32 s17, s18, s17
-; GCN-NEXT: s_mul_i32 s14, s14, s16
-; GCN-NEXT: s_add_i32 s17, s17, s14
-; GCN-NEXT: s_mul_i32 s9, s9, s16
-; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
-; GCN-NEXT: s_mul_i32 s19, s15, s9
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: s_add_u32 s19, s14, s15
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s14, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
+; GCN-NEXT: s_add_i32 s14, s15, s14
+; GCN-NEXT: s_mul_i32 s16, s16, s19
+; GCN-NEXT: s_add_i32 s14, s14, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
+; GCN-NEXT: s_mul_i32 s18, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s19, s14
+; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
-; GCN-NEXT: s_addc_u32 s9, s20, s18
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s17, s15, s17
-; GCN-NEXT: s_add_u32 s9, s9, s17
-; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_add_u32 s9, s16, s9
-; GCN-NEXT: s_addc_u32 s18, s15, s14
+; GCN-NEXT: s_add_u32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
+; GCN-NEXT: s_addc_u32 s9, s20, s16
+; GCN-NEXT: s_addc_u32 s15, s15, 0
+; GCN-NEXT: s_mul_i32 s14, s17, s14
+; GCN-NEXT: s_add_u32 s9, s9, s14
+; GCN-NEXT: s_addc_u32 s16, 0, s15
+; GCN-NEXT: s_add_u32 s9, s19, s9
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
+; GCN-NEXT: s_addc_u32 s18, s17, s16
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s20, s22, s6
+; GCN-NEXT: s_sub_u32 s25, s22, s6
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s20, s22
+; GCN-NEXT: s_cselect_b32 s20, s25, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s22, 0, s7
+; GCN-NEXT: s_subb_u32 s24, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s24, v0
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
-; GCN-NEXT: s_mul_i32 s26, s22, s24
-; GCN-NEXT: s_add_i32 s25, s27, s25
-; GCN-NEXT: s_add_i32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s24
-; GCN-NEXT: s_mul_i32 s27, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
+; GCN-NEXT: v_readfirstlane_b32 s25, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
+; GCN-NEXT: s_mul_i32 s26, s24, s22
+; GCN-NEXT: s_add_i32 s23, s27, s23
+; GCN-NEXT: s_add_i32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s22
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
-; GCN-NEXT: s_mul_i32 s28, s23, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
+; GCN-NEXT: s_mul_i32 s28, s25, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s23, s25, s23
+; GCN-NEXT: s_add_u32 s23, s26, s23
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s24, s24, s25
-; GCN-NEXT: s_addc_u32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s25, s17, s23
-; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
-; GCN-NEXT: s_add_i32 s25, s26, s25
-; GCN-NEXT: s_mul_i32 s22, s22, s24
-; GCN-NEXT: s_add_i32 s25, s25, s22
-; GCN-NEXT: s_mul_i32 s17, s17, s24
-; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
-; GCN-NEXT: s_mul_i32 s27, s23, s17
-; GCN-NEXT: s_mul_i32 s29, s24, s25
-; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
+; GCN-NEXT: s_add_u32 s27, s22, s23
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s22, s17, s25
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
+; GCN-NEXT: s_add_i32 s22, s23, s22
+; GCN-NEXT: s_mul_i32 s24, s24, s27
+; GCN-NEXT: s_add_i32 s22, s22, s24
+; GCN-NEXT: s_mul_i32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
+; GCN-NEXT: s_mul_i32 s26, s25, s17
+; GCN-NEXT: s_mul_i32 s29, s27, s22
+; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
-; GCN-NEXT: s_addc_u32 s17, s28, s26
-; GCN-NEXT: s_addc_u32 s22, s22, 0
-; GCN-NEXT: s_mul_i32 s25, s23, s25
-; GCN-NEXT: s_add_u32 s17, s17, s25
-; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s17, s24, s17
-; GCN-NEXT: s_addc_u32 s26, s23, s22
+; GCN-NEXT: s_add_u32 s17, s17, s26
+; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
+; GCN-NEXT: s_addc_u32 s17, s28, s24
+; GCN-NEXT: s_addc_u32 s23, s23, 0
+; GCN-NEXT: s_mul_i32 s22, s25, s22
+; GCN-NEXT: s_add_u32 s17, s17, s22
+; GCN-NEXT: s_addc_u32 s24, 0, s23
+; GCN-NEXT: s_add_u32 s17, s27, s17
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_addc_u32 s26, s25, s24
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s28, s30, s6
+; GCN-NEXT: s_sub_u32 s34, s30, s6
+; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s28, s30
+; GCN-NEXT: s_cselect_b32 s28, s34, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s20, 0, s19
+; GCN-NEXT: s_subb_u32 s22, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s21, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
-; GCN-NEXT: s_mul_i32 s24, s20, s22
-; GCN-NEXT: s_add_i32 s23, s25, s23
-; GCN-NEXT: s_add_i32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s22
-; GCN-NEXT: s_mul_i32 s25, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s20, v0
+; GCN-NEXT: s_mul_i32 s21, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
+; GCN-NEXT: s_mul_i32 s24, s22, s20
+; GCN-NEXT: s_add_i32 s21, s25, s21
+; GCN-NEXT: s_add_i32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s20
+; GCN-NEXT: s_mul_i32 s25, s20, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
-; GCN-NEXT: s_mul_i32 s26, s21, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
+; GCN-NEXT: s_mul_i32 s26, s23, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s21, s23, s21
+; GCN-NEXT: s_add_u32 s21, s24, s21
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s22, s22, s23
-; GCN-NEXT: s_addc_u32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s23, s13, s21
-; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
-; GCN-NEXT: s_add_i32 s23, s24, s23
-; GCN-NEXT: s_mul_i32 s20, s20, s22
-; GCN-NEXT: s_add_i32 s23, s23, s20
-; GCN-NEXT: s_mul_i32 s13, s13, s22
-; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
-; GCN-NEXT: s_mul_i32 s25, s21, s13
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: s_add_u32 s25, s20, s21
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s20, s13, s23
+; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
+; GCN-NEXT: s_add_i32 s20, s21, s20
+; GCN-NEXT: s_mul_i32 s22, s22, s25
+; GCN-NEXT: s_add_i32 s20, s20, s22
+; GCN-NEXT: s_mul_i32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
+; GCN-NEXT: s_mul_i32 s24, s23, s13
+; GCN-NEXT: s_mul_i32 s27, s25, s20
+; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
-; GCN-NEXT: s_addc_u32 s13, s26, s24
-; GCN-NEXT: s_addc_u32 s20, s20, 0
-; GCN-NEXT: s_mul_i32 s23, s21, s23
-; GCN-NEXT: s_add_u32 s13, s13, s23
-; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s13, s22, s13
-; GCN-NEXT: s_addc_u32 s24, s21, s20
+; GCN-NEXT: s_add_u32 s13, s13, s24
+; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
+; GCN-NEXT: s_addc_u32 s13, s26, s22
+; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_mul_i32 s20, s23, s20
+; GCN-NEXT: s_add_u32 s13, s13, s20
+; GCN-NEXT: s_addc_u32 s22, 0, s21
+; GCN-NEXT: s_add_u32 s13, s25, s13
+; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_addc_u32 s24, s23, s22
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s26, s28, s18
+; GCN-NEXT: s_sub_u32 s31, s28, s18
+; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s26, s28
+; GCN-NEXT: s_cselect_b32 s26, s31, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s16, 0, s15
+; GCN-NEXT: s_subb_u32 s18, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s18, v0
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
-; GCN-NEXT: s_mul_i32 s20, s16, s18
-; GCN-NEXT: s_add_i32 s19, s21, s19
-; GCN-NEXT: s_add_i32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s18
-; GCN-NEXT: s_mul_i32 s21, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
+; GCN-NEXT: v_readfirstlane_b32 s19, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
+; GCN-NEXT: s_mul_i32 s20, s18, s16
+; GCN-NEXT: s_add_i32 s17, s21, s17
+; GCN-NEXT: s_add_i32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s16
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
-; GCN-NEXT: s_mul_i32 s22, s17, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
+; GCN-NEXT: s_mul_i32 s22, s19, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s17, s19, s17
+; GCN-NEXT: s_add_u32 s17, s20, s17
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s18, s18, s19
-; GCN-NEXT: s_addc_u32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s19, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
-; GCN-NEXT: s_add_i32 s19, s20, s19
-; GCN-NEXT: s_mul_i32 s16, s16, s18
-; GCN-NEXT: s_add_i32 s19, s19, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s17, s9
-; GCN-NEXT: s_mul_i32 s23, s18, s19
-; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
+; GCN-NEXT: s_add_u32 s21, s16, s17
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s16, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
+; GCN-NEXT: s_add_i32 s16, s17, s16
+; GCN-NEXT: s_mul_i32 s18, s18, s21
+; GCN-NEXT: s_add_i32 s16, s16, s18
+; GCN-NEXT: s_mul_i32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
+; GCN-NEXT: s_mul_i32 s20, s19, s9
+; GCN-NEXT: s_mul_i32 s23, s21, s16
+; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
-; GCN-NEXT: s_addc_u32 s9, s22, s20
-; GCN-NEXT: s_addc_u32 s16, s16, 0
-; GCN-NEXT: s_mul_i32 s19, s17, s19
-; GCN-NEXT: s_add_u32 s9, s9, s19
-; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s9, s18, s9
-; GCN-NEXT: s_addc_u32 s20, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s20
+; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
+; GCN-NEXT: s_addc_u32 s9, s22, s18
+; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_mul_i32 s16, s19, s16
+; GCN-NEXT: s_add_u32 s9, s9, s16
+; GCN-NEXT: s_addc_u32 s18, 0, s17
+; GCN-NEXT: s_add_u32 s9, s21, s9
+; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_addc_u32 s20, s19, s18
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s22, s24, s14
+; GCN-NEXT: s_sub_u32 s27, s24, s14
+; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s22, s24
+; GCN-NEXT: s_cselect_b32 s22, s27, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s12, 0, s11
+; GCN-NEXT: s_subb_u32 s14, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
-; GCN-NEXT: s_mul_i32 s16, s12, s14
-; GCN-NEXT: s_add_i32 s15, s17, s15
-; GCN-NEXT: s_add_i32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s14
-; GCN-NEXT: s_mul_i32 s17, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
+; GCN-NEXT: s_mul_i32 s16, s14, s12
+; GCN-NEXT: s_add_i32 s13, s17, s13
+; GCN-NEXT: s_add_i32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s12
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
-; GCN-NEXT: s_mul_i32 s18, s13, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
+; GCN-NEXT: s_mul_i32 s18, s15, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s13, s15, s13
+; GCN-NEXT: s_add_u32 s13, s16, s13
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s14, s14, s15
-; GCN-NEXT: s_addc_u32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s15, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
-; GCN-NEXT: s_add_i32 s15, s16, s15
-; GCN-NEXT: s_mul_i32 s12, s12, s14
-; GCN-NEXT: s_add_i32 s15, s15, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s13, s3
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: s_add_u32 s17, s12, s13
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s12, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s14, s14, s17
+; GCN-NEXT: s_add_i32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
+; GCN-NEXT: s_mul_i32 s16, s15, s3
+; GCN-NEXT: s_mul_i32 s19, s17, s12
+; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
-; GCN-NEXT: s_addc_u32 s3, s18, s16
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s15, s13, s15
-; GCN-NEXT: s_add_u32 s3, s3, s15
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s3, s14, s3
-; GCN-NEXT: s_addc_u32 s16, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s16
+; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
+; GCN-NEXT: s_addc_u32 s3, s18, s14
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s12, s15, s12
+; GCN-NEXT: s_add_u32 s3, s3, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s13
+; GCN-NEXT: s_add_u32 s3, s17, s3
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
+; GCN-NEXT: s_addc_u32 s16, s15, s14
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s18, s20, s10
+; GCN-NEXT: s_sub_u32 s23, s20, s10
+; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s18, s20
+; GCN-NEXT: s_cselect_b32 s18, s23, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s16, s18, s6
+; TONGA-NEXT: s_sub_u32 s21, s18, s6
+; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
+; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s16, s18
+; TONGA-NEXT: s_cselect_b32 s16, s21, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb04..33b0a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s12, s5, s10
-; GCN-NEXT: s_sub_i32 s10, s7, s12
+; GCN-NEXT: s_add_i32 s10, s5, s10
+; GCN-NEXT: s_sub_i32 s11, s7, s10
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
-; GCN-NEXT: s_subb_u32 s13, s10, s9
-; GCN-NEXT: s_sub_u32 s14, s6, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s8
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, s17, s16
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
-; GCN-NEXT: s_cselect_b32 s10, s10, s15
+; GCN-NEXT: s_or_b32 s12, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s13, s6, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s12
-; GCN-NEXT: s_cmp_ge_u32 s4, s9
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s14, s11, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s8
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s9
+; GCN-NEXT: s_cselect_b32 s15, s15, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s16, s13, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s11, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s5, s16, s13
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s10
+; GCN-NEXT: s_cmp_ge_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s9
-; GCN-NEXT: s_cselect_b32 s5, s7, s5
-; GCN-NEXT: s_cmp_lg_u32 s5, 0
-; GCN-NEXT: s_cselect_b32 s4, s10, s4
-; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_cselect_b32 s8, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s8, s8, s10
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cselect_b32 s5, s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1009,9 +1016,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1042,6 +1050,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1074,43 +1083,46 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s14, s11, s12
-; GCN-NEXT: s_sub_i32 s12, s7, s14
+; GCN-NEXT: s_add_i32 s12, s11, s12
+; GCN-NEXT: s_sub_i32 s13, s7, s12
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s13, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s6, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s17, s12, s13
-; GCN-NEXT: s_subb_u32 s17, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s17, s5
-; GCN-NEXT: s_cselect_b32 s18, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s4
-; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s17, s5
-; GCN-NEXT: s_cselect_b32 s18, s19, s18
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s15, s15, s5
-; GCN-NEXT: s_sub_u32 s19, s16, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s12, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_cselect_b32 s13, s19, s16
-; GCN-NEXT: s_cselect_b32 s12, s12, s17
+; GCN-NEXT: s_or_b32 s14, s10, s11
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s13, s13, s5
+; GCN-NEXT: s_sub_u32 s15, s6, s4
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s16, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s5
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s4
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s16, s5
+; GCN-NEXT: s_cselect_b32 s17, s17, s11
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s13, s13, s5
+; GCN-NEXT: s_sub_u32 s18, s15, s4
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s7, s7, s14
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s11, s18, s15
+; GCN-NEXT: s_cselect_b32 s10, s10, s16
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s12
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s10
+; GCN-NEXT: s_cselect_b32 s4, s4, s12
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s12, s7
-; GCN-NEXT: s_cselect_b32 s4, s13, s6
+; GCN-NEXT: s_cselect_b32 s5, s10, s7
+; GCN-NEXT: s_cselect_b32 s4, s11, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1158,6 +1170,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1191,6 +1204,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
+; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1355,9 +1369,10 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1388,6 +1403,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1402,42 +1418,45 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s10, s8, s7
-; GCN-NEXT: s_sub_i32 s8, 0, s10
-; GCN-NEXT: s_sub_u32 s11, 24, s6
+; GCN-NEXT: s_add_i32 s8, s8, s7
+; GCN-NEXT: s_sub_i32 s9, 0, s8
+; GCN-NEXT: s_sub_u32 s10, 24, s6
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s11, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s9, s9, s5
+; GCN-NEXT: s_sub_u32 s12, s10, s4
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s9, s6, s7
-; GCN-NEXT: s_subb_u32 s12, s8, s5
-; GCN-NEXT: s_sub_u32 s13, s11, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_subb_u32 s14, s12, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s5
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s4
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s5
-; GCN-NEXT: s_cselect_b32 s15, s16, s15
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s13, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
-; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_subb_u32 s6, 0, s10
-; GCN-NEXT: s_cmp_ge_u32 s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s13, s9, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s4
+; GCN-NEXT: s_cmp_ge_u32 s12, s4
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s13, s5
+; GCN-NEXT: s_cselect_b32 s14, s14, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s9, s9, s5
+; GCN-NEXT: s_sub_u32 s15, s12, s4
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s6, s6, s7
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_subb_u32 s6, s9, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s7, s15, s12
+; GCN-NEXT: s_cselect_b32 s6, s6, s13
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s8
+; GCN-NEXT: s_cmp_ge_u32 s8, s5
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s6, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cmp_eq_u32 s8, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s9
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s8, s6
-; GCN-NEXT: s_cselect_b32 s5, s9, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s8
+; GCN-NEXT: s_cselect_b32 s5, s7, s10
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1470,6 +1489,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
+; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1502,6 +1522,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25..bb5918b2 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -34,9 +35,11 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: s_addc_u32 s3, s3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
@@ -50,12 +53,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s4, s2, s6
-; GFX9-NEXT: s_addc_u32 s5, s3, s7
+; GFX9-NEXT: s_add_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_addc_u32 s4, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -68,6 +73,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -84,12 +91,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -435,6 +444,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -455,14 +465,16 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_addc_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -474,10 +486,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s12, s14
-; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_add_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_addc_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -490,8 +504,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -504,8 +520,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_addc_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac..41199b0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,6 +148,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -181,6 +182,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -829,9 +831,10 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -862,6 +865,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -870,50 +874,52 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s10, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s10
+; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s11, s1, s0
-; GCN-NEXT: s_sub_i32 s8, 0, s11
-; GCN-NEXT: s_mul_i32 s0, s2, s10
-; GCN-NEXT: s_sub_u32 s12, 24, s0
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_mul_i32 s0, s2, s8
+; GCN-NEXT: s_sub_u32 s11, 24, s0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s12, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s9, s0, s1
-; GCN-NEXT: s_subb_u32 s13, s8, s3
-; GCN-NEXT: s_sub_u32 s14, s12, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s2
-; GCN-NEXT: s_cselect_b32 s13, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s8, s13, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s13, 0, 0
-; GCN-NEXT: s_add_u32 s14, s10, 2
-; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s14, s9
-; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s11
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s0, s10, 0
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s2, s1
+; GCN-NEXT: s_cselect_b32 s0, s10, s1
+; GCN-NEXT: s_add_u32 s1, s8, 1
+; GCN-NEXT: s_addc_u32 s10, 0, 0
+; GCN-NEXT: s_add_u32 s13, s8, 2
+; GCN-NEXT: s_addc_u32 s14, 0, 0
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s9, 0
-; GCN-NEXT: s_cselect_b32 s1, s8, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_cselect_b32 s0, s13, s1
+; GCN-NEXT: s_cselect_b32 s1, s14, s10
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s9, 0, s9
+; GCN-NEXT: s_cmp_ge_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s2, s2, s10
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s1, s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s0, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -939,6 +945,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -971,6 +978,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1309,6 +1317,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1338,6 +1347,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
+; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1f..cdcc914 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s12, s5, s10
-; GCN-NEXT: s_sub_i32 s10, s7, s12
+; GCN-NEXT: s_add_i32 s10, s5, s10
+; GCN-NEXT: s_sub_i32 s11, s7, s10
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
-; GCN-NEXT: s_subb_u32 s13, s10, s9
-; GCN-NEXT: s_sub_u32 s14, s6, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
-; GCN-NEXT: s_subb_u32 s15, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s8
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s15, s9
-; GCN-NEXT: s_cselect_b32 s16, s17, s16
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
-; GCN-NEXT: s_cselect_b32 s10, s10, s15
+; GCN-NEXT: s_or_b32 s12, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s13, s6, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s12
-; GCN-NEXT: s_cmp_ge_u32 s4, s9
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s14, s11, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s8
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s9
+; GCN-NEXT: s_cselect_b32 s15, s15, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s11, s11, s9
+; GCN-NEXT: s_sub_u32 s16, s13, s8
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_subb_u32 s4, s11, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s5, s16, s13
+; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_subb_u32 s7, s7, s10
+; GCN-NEXT: s_cmp_ge_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s9
-; GCN-NEXT: s_cselect_b32 s5, s7, s5
-; GCN-NEXT: s_cmp_lg_u32 s5, 0
-; GCN-NEXT: s_cselect_b32 s4, s10, s4
-; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_cselect_b32 s8, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s7, s9
+; GCN-NEXT: s_cselect_b32 s8, s8, s10
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
+; GCN-NEXT: s_cselect_b32 s5, s5, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
+; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
+; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -846,9 +853,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -879,6 +887,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -894,43 +903,46 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s10, s1, s0
-; GCN-NEXT: s_sub_i32 s9, 0, s10
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
+; GCN-NEXT: s_sub_u32 s8, 24, s0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s11, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s12, s8, s2
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s8, s0, s1
-; GCN-NEXT: s_subb_u32 s12, s9, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_subb_u32 s14, s12, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s3
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s3
-; GCN-NEXT: s_cselect_b32 s15, s16, s15
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s3
-; GCN-NEXT: s_sub_u32 s16, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
-; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s10
-; GCN-NEXT: s_cmp_ge_u32 s0, s3
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s13, s10, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s13, s3
+; GCN-NEXT: s_cselect_b32 s14, s14, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s15, s12, s2
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s1, s15, s12
+; GCN-NEXT: s_cselect_b32 s0, s0, s13
+; GCN-NEXT: s_cmp_lg_u32 s11, 0
+; GCN-NEXT: s_subb_u32 s9, 0, s9
+; GCN-NEXT: s_cmp_ge_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s1, s2, s1
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s8, s0
-; GCN-NEXT: s_cselect_b32 s1, s9, s11
+; GCN-NEXT: s_cmp_eq_u32 s9, s3
+; GCN-NEXT: s_cselect_b32 s2, s2, s10
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s0, s0, s9
+; GCN-NEXT: s_cselect_b32 s1, s1, s8
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -958,6 +970,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -990,6 +1003,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
+; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1079,6 +1093,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
+; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1108,6 +1123,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
+; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db647..d67a7b1 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -34,9 +35,11 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: s_subb_u32 s3, s3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
@@ -50,12 +53,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s4, s2, s6
-; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: s_sub_u32 s6, s2, s6
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_subb_u32 s4, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -68,6 +73,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
+; GFX10-NEXT: s_cselect_b32 s4, -1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -84,12 +91,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -434,6 +443,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
+; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -454,14 +464,16 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_subb_u32 s0, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -473,10 +485,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s12, s14
-; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_sub_u32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_subb_u32 s0, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -489,8 +503,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -503,8 +519,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: s_cselect_b32 s6, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
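Reading across these check-line updates: after each s_sub_u32, the borrow in SCC is now materialized into a register with s_cselect_b32/b64 and then re-established with s_cmp_lg_u32/u64 right before the dependent s_subb_u32, seemingly so the borrow survives as a register value even when other SALU ops sit between the two halves. A minimal C++ model of the intended carry-chain semantics, under that reading (illustrative names, not LLVM APIs):

#include <cassert>
#include <cstdint>

// Sketch of a 64-bit unsigned subtract as the updated sequence performs it:
// the low-half borrow is captured as 0/-1 (s_cselect), compared against zero
// to regenerate SCC (s_cmp_lg), and then consumed by the high half (s_subb).
struct U64 { uint32_t lo, hi; };

static U64 usub64(U64 a, U64 b) {
  uint32_t lo = a.lo - b.lo;                   // s_sub_u32 (defines SCC = borrow)
  uint32_t borrow = (a.lo < b.lo) ? ~0u : 0u;  // s_cselect_b32 sN, -1, 0
  bool scc = (borrow != 0);                    // s_cmp_lg_u32 sN, 0 (restores SCC)
  uint32_t hi = a.hi - b.hi - (scc ? 1u : 0u); // s_subb_u32
  return {lo, hi};
}

int main() {
  U64 r = usub64({0u, 1u}, {1u, 0u}); // 2^32 - 1
  assert(r.lo == 0xffffffffu && r.hi == 0u);
}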
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 28c6b40..75db387 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,40 +774,44 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
+; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
+; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
+; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
+; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
-; GFX1032-NEXT: s_mul_i32 s12, s9, s8
-; GFX1032-NEXT: s_mul_i32 s9, s9, s5
; GFX1032-NEXT: s_mul_i32 s10, s10, s8
-; GFX1032-NEXT: s_add_i32 s9, s11, s9
-; GFX1032-NEXT: s_mul_i32 s11, s5, s12
+; GFX1032-NEXT: s_mul_i32 s9, s9, s5
+; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
+; GFX1032-NEXT: s_add_i32 s9, s13, s9
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX1032-NEXT: s_mul_i32 s10, s5, s11
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s10, s10, s15
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
+; GFX1032-NEXT: s_add_u32 s12, s12, s15
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s10, s11
+; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s12, s10
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s12, 0
+; GFX1032-NEXT: s_addc_u32 s11, s11, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
+; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
+; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
; GFX1032-NEXT: s_mul_i32 s8, s3, s8
-; GFX1032-NEXT: s_add_u32 s9, s9, s12
-; GFX1032-NEXT: s_addc_u32 s11, 0, s11
+; GFX1032-NEXT: s_mul_i32 s12, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
+; GFX1032-NEXT: s_add_u32 s11, s11, s12
+; GFX1032-NEXT: s_addc_u32 s10, 0, s10
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s9, s8
+; GFX1032-NEXT: s_add_u32 s8, s11, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s11, s10
+; GFX1032-NEXT: s_addc_u32 s8, s10, s9
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -820,8 +824,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
+; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
+; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
+; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -894,8 +901,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s8, 0, s0
-; GFX1064-NEXT: s_subb_u32 s9, 0, s1
+; GFX1064-NEXT: s_sub_u32 s9, 0, s0
+; GFX1064-NEXT: s_subb_u32 s10, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -904,102 +911,109 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
-; GFX1064-NEXT: s_mul_i32 s10, s8, s4
-; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
-; GFX1064-NEXT: s_mul_i32 s11, s9, s5
-; GFX1064-NEXT: s_add_i32 s10, s12, s10
-; GFX1064-NEXT: s_mul_i32 s13, s8, s5
-; GFX1064-NEXT: s_add_i32 s10, s10, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
-; GFX1064-NEXT: s_mul_i32 s15, s5, s10
-; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
-; GFX1064-NEXT: s_mul_i32 s11, s4, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
+; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT: s_mul_i32 s5, s9, s8
+; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
+; GFX1064-NEXT: s_mul_i32 s11, s10, s4
+; GFX1064-NEXT: s_add_i32 s5, s12, s5
+; GFX1064-NEXT: s_mul_i32 s13, s9, s4
+; GFX1064-NEXT: s_add_i32 s5, s5, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
+; GFX1064-NEXT: s_mul_i32 s15, s4, s5
+; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
+; GFX1064-NEXT: s_mul_i32 s11, s8, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
+; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s10, s4, s10
+; GFX1064-NEXT: s_mul_i32 s5, s8, s5
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
+; GFX1064-NEXT: s_add_u32 s5, s11, s5
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s5, s5, s10
-; GFX1064-NEXT: s_addc_u32 s4, s4, s11
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
-; GFX1064-NEXT: s_mul_i32 s11, s8, s5
-; GFX1064-NEXT: s_mul_i32 s8, s8, s4
-; GFX1064-NEXT: s_mul_i32 s9, s9, s5
-; GFX1064-NEXT: s_add_i32 s8, s10, s8
-; GFX1064-NEXT: s_mul_i32 s10, s4, s11
-; GFX1064-NEXT: s_add_i32 s8, s8, s9
-; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
-; GFX1064-NEXT: s_mul_i32 s14, s5, s8
-; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
-; GFX1064-NEXT: s_add_u32 s9, s9, s14
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
+; GFX1064-NEXT: s_add_u32 s12, s4, s5
+; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_mul_i32 s4, s9, s12
+; GFX1064-NEXT: s_addc_u32 s8, s8, s11
+; GFX1064-NEXT: s_mul_i32 s10, s10, s12
+; GFX1064-NEXT: s_mul_i32 s9, s9, s8
+; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
+; GFX1064-NEXT: s_add_i32 s9, s13, s9
+; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX1064-NEXT: s_add_i32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s4, s8, s4
+; GFX1064-NEXT: s_mul_i32 s14, s12, s9
+; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
+; GFX1064-NEXT: s_add_u32 s5, s5, s14
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
-; GFX1064-NEXT: s_add_u32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s8, s4, s8
-; GFX1064-NEXT: s_addc_u32 s9, s13, s12
-; GFX1064-NEXT: s_addc_u32 s10, s11, 0
-; GFX1064-NEXT: s_add_u32 s8, s9, s8
-; GFX1064-NEXT: s_addc_u32 s9, 0, s10
-; GFX1064-NEXT: s_add_u32 s5, s5, s8
-; GFX1064-NEXT: s_addc_u32 s4, s4, s9
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
-; GFX1064-NEXT: s_mul_i32 s11, s2, s4
-; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
-; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
+; GFX1064-NEXT: s_add_u32 s4, s5, s4
+; GFX1064-NEXT: s_mul_i32 s9, s8, s9
+; GFX1064-NEXT: s_addc_u32 s4, s13, s11
+; GFX1064-NEXT: s_addc_u32 s5, s10, 0
+; GFX1064-NEXT: s_add_u32 s4, s4, s9
+; GFX1064-NEXT: s_addc_u32 s9, 0, s5
+; GFX1064-NEXT: s_add_u32 s10, s12, s4
+; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
+; GFX1064-NEXT: s_addc_u32 s5, s8, s9
+; GFX1064-NEXT: s_mul_i32 s8, s3, s10
+; GFX1064-NEXT: s_mul_i32 s10, s2, s5
+; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
+; GFX1064-NEXT: s_addc_u32 s9, 0, s9
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
+; GFX1064-NEXT: s_add_u32 s8, s10, s8
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s8, s11
-; GFX1064-NEXT: s_addc_u32 s10, 0, s10
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
-; GFX1064-NEXT: s_add_u32 s5, s8, s5
-; GFX1064-NEXT: s_mul_i32 s4, s3, s4
-; GFX1064-NEXT: s_addc_u32 s5, s10, s9
+; GFX1064-NEXT: s_addc_u32 s4, s9, s4
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s5, s4
+; GFX1064-NEXT: s_add_u32 s10, s4, s5
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s8, s4, s8
+; GFX1064-NEXT: s_add_i32 s12, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s9, s3, s8
-; GFX1064-NEXT: s_sub_u32 s12, s2, s4
+; GFX1064-NEXT: s_sub_i32 s8, s3, s12
+; GFX1064-NEXT: s_sub_u32 s13, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_subb_u32 s9, s9, s1
-; GFX1064-NEXT: s_sub_u32 s13, s12, s0
-; GFX1064-NEXT: s_subb_u32 s9, s9, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_subb_u32 s14, s8, s1
+; GFX1064-NEXT: s_sub_u32 s15, s13, s0
+; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
+; GFX1064-NEXT: s_subb_u32 s8, s14, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
+; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
-; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
-; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
-; GFX1064-NEXT: s_add_u32 s13, s10, 1
+; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
+; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
+; GFX1064-NEXT: s_add_u32 s9, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
+; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s8
+; GFX1064-NEXT: s_subb_u32 s3, s3, s12
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 4445383..64d055b 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,6 +271,7 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
+; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -280,6 +281,7 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
+; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -297,6 +299,8 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
+; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -307,6 +311,7 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
+; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -316,6 +321,7 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
+; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -333,6 +339,8 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
+; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/Transforms/SCCP/constant-range-struct.ll b/llvm/test/Transforms/SCCP/constant-range-struct.ll
index 7a399df..0f45b38 100644
--- a/llvm/test/Transforms/SCCP/constant-range-struct.ll
+++ b/llvm/test/Transforms/SCCP/constant-range-struct.ll
@@ -25,7 +25,7 @@ true:
br label %exit
false:
- %s.3 = insertvalue {i64, i64} undef, i64 30, 0
+ %s.3 = insertvalue {i64, i64} poison, i64 30, 0
%s.4 = insertvalue {i64, i64} %s.3, i64 300, 1
br label %exit
@@ -39,14 +39,14 @@ define void @struct1_caller() {
; CHECK-NEXT: [[S:%.*]] = call { i64, i64 } @struct1()
; CHECK-NEXT: [[V1:%.*]] = extractvalue { i64, i64 } [[S]], 0
; CHECK-NEXT: [[V2:%.*]] = extractvalue { i64, i64 } [[S]], 1
-; CHECK-NEXT: [[T_1:%.*]] = icmp ne i64 [[V1]], 10
-; CHECK-NEXT: call void @use(i1 [[T_1]])
-; CHECK-NEXT: [[T_2:%.*]] = icmp ult i64 [[V1]], 100
-; CHECK-NEXT: call void @use(i1 [[T_2]])
-; CHECK-NEXT: [[T_3:%.*]] = icmp ne i64 [[V2]], 0
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_3:%.*]] = icmp eq i64 [[V1]], 20
; CHECK-NEXT: call void @use(i1 [[T_3]])
-; CHECK-NEXT: [[T_4:%.*]] = icmp ult i64 [[V2]], 301
-; CHECK-NEXT: call void @use(i1 [[T_4]])
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_6:%.*]] = icmp eq i64 [[V2]], 300
+; CHECK-NEXT: call void @use(i1 [[T_6]])
; CHECK-NEXT: ret void
;
%s = call {i64, i64} @struct1()
@@ -57,10 +57,14 @@ define void @struct1_caller() {
call void @use(i1 %t.1)
%t.2 = icmp ult i64 %v1, 100
call void @use(i1 %t.2)
- %t.3 = icmp ne i64 %v2, 0
+ %t.3 = icmp eq i64 %v1, 20
call void @use(i1 %t.3)
- %t.4 = icmp ult i64 %v2, 301
+ %t.4 = icmp ne i64 %v2, 0
call void @use(i1 %t.4)
+ %t.5 = icmp ult i64 %v2, 301
+ call void @use(i1 %t.5)
+ %t.6 = icmp eq i64 %v2, 300
+ call void @use(i1 %t.6)
ret void
}
@@ -76,7 +80,7 @@ define internal {i64, i64} @struct2() {
; CHECK: exit:
; CHECK-NEXT: [[V1:%.*]] = phi i64 [ 20, [[TRUE]] ], [ 30, [[FALSE]] ]
; CHECK-NEXT: [[V2:%.*]] = phi i64 [ 200, [[TRUE]] ], [ 300, [[FALSE]] ]
-; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } undef, i64 [[V1]], 0
+; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } poison, i64 [[V1]], 0
; CHECK-NEXT: [[S_2:%.*]] = insertvalue { i64, i64 } [[S_1]], i64 [[V2]], 1
; CHECK-NEXT: ret { i64, i64 } [[S_2]]
;
@@ -92,7 +96,7 @@ false:
exit:
%v1 = phi i64 [ 20, %true ], [ 30, %false ]
%v2 = phi i64 [ 200, %true ], [ 300, %false ]
- %s.1 = insertvalue {i64, i64} undef, i64 %v1, 0
+ %s.1 = insertvalue {i64, i64} poison, i64 %v1, 0
%s.2 = insertvalue {i64, i64} %s.1, i64 %v2, 1
ret {i64, i64} %s.2
}
@@ -153,3 +157,40 @@ define void @struct2_caller() {
ret void
}
+
+%"phi_type" = type {i64, i64}
+
+define internal %"phi_type" @test(i32 %input) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: br label [[COND_TRUE_I:%.*]]
+; CHECK: cond.true.i:
+; CHECK-NEXT: br label [[COND_END_I:%.*]]
+; CHECK: cond.end.i:
+; CHECK-NEXT: ret [[PHI_TYPE:%.*]] poison
+;
+ %cmp.cond = icmp eq i32 %input, 1
+ br i1 %cmp.cond, label %cond.true.i, label %cond.false.i
+
+cond.true.i:
+ %r1.tmp = insertvalue %"phi_type" poison, i64 1, 0
+ %r1.tmp.2 = insertvalue %"phi_type" %r1.tmp, i64 2, 1
+ br label %cond.end.i
+
+cond.false.i:
+ %r2.tmp = insertvalue %"phi_type" poison, i64 3, 0
+ %r2.tmp.2 = insertvalue %"phi_type" %r2.tmp, i64 4, 1
+ br label %cond.end.i
+
+cond.end.i:
+ %retval = phi %"phi_type" [ %r1.tmp.2, %cond.true.i ], [ %r2.tmp.2, %cond.false.i ]
+ ret %"phi_type" %retval
+}
+
+define %"phi_type" @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[CALL_1:%.*]] = tail call fastcc [[PHI_TYPE:%.*]] @[[TEST:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 noundef 1)
+; CHECK-NEXT: ret [[PHI_TYPE]] { i64 1, i64 2 }
+;
+ %call.1 = tail call fastcc noundef %"phi_type" @test(i32 noundef 1)
+ ret %"phi_type" %call.1
+}
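For orientation: @struct1 returns either {20, 200} or {30, 300}, so with the undef-to-poison switch SCCP can keep per-field constant ranges for the returned struct. Comparisons decided over the whole range fold to true in the updated CHECK lines, while the newly added eq comparisons split the range and survive as icmps. A small C++ sketch of that range reasoning (illustrative helper, not SCCP's actual code):

#include <cassert>

// Half-open range [lo, hi) and a ult-against-constant folder: returns 1/0
// when the comparison is decided for every value in the range, -1 when it
// must stay an icmp (the new eq-20 / eq-300 cases behave analogously).
struct Range { long lo, hi; };

static int foldULT(Range r, long c) {
  if (r.hi <= c) return 1;  // whole range below c: always true
  if (r.lo >= c) return 0;  // whole range at or above c: always false
  return -1;                // undecided
}

int main() {
  Range v1{20, 31};               // {20 or 30} from the two predecessors
  assert(foldULT(v1, 100) == 1);  // %t.2 folds to true
  assert(foldULT(v1, 25) == -1);  // a range-splitting compare is kept
}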
diff --git a/llvm/unittests/ADT/BitTest.cpp b/llvm/unittests/ADT/BitTest.cpp
index eaed4e1..5b3df91 100644
--- a/llvm/unittests/ADT/BitTest.cpp
+++ b/llvm/unittests/ADT/BitTest.cpp
@@ -270,6 +270,22 @@ TEST(BitTest, BitWidthConstexpr) {
llvm::bit_width_constexpr(std::numeric_limits<uint64_t>::max()) == 64);
}
+TEST(BitTest, BitCeilConstexpr) {
+ static_assert(llvm::bit_ceil_constexpr(0u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(1u) == 1);
+ static_assert(llvm::bit_ceil_constexpr(2u) == 2);
+ static_assert(llvm::bit_ceil_constexpr(3u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(4u) == 4);
+ static_assert(llvm::bit_ceil_constexpr(5u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(6u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(7u) == 8);
+ static_assert(llvm::bit_ceil_constexpr(8u) == 8);
+
+ static_assert(llvm::bit_ceil_constexpr(255u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(256u) == 256);
+ static_assert(llvm::bit_ceil_constexpr(257u) == 512);
+}
+
TEST(BitTest, CountlZero) {
uint8_t Z8 = 0;
uint16_t Z16 = 0;
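The new static_asserts pin down the contract of bit_ceil_constexpr: the smallest power of two greater than or equal to the input, with 0 mapping to 1. A from-scratch sketch with that behavior (not LLVM's implementation, which lives in llvm/ADT/bit.h):

#include <type_traits>

// Illustrative constexpr bit-ceil matching the asserted contract; inputs
// above the largest representable power of two are not handled here.
template <typename T>
constexpr T bitCeilSketch(T V) {
  static_assert(std::is_unsigned_v<T>, "requires an unsigned type");
  T Ceil = 1;
  while (Ceil < V) // double until we reach or pass V; V <= 1 yields 1
    Ceil <<= 1;
  return Ceil;
}

static_assert(bitCeilSketch(0u) == 1u);
static_assert(bitCeilSketch(3u) == 4u);
static_assert(bitCeilSketch(256u) == 256u);
static_assert(bitCeilSketch(257u) == 512u);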
diff --git a/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td b/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td
index 3143ab7..99b22e5 100644
--- a/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td
+++ b/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td
@@ -220,8 +220,6 @@ def YieldOp : SMTOp<"yield", [
Pure,
Terminator,
ReturnLike,
- ParentOneOf<["smt::SolverOp", "smt::CheckOp",
- "smt::ForallOp", "smt::ExistsOp"]>,
]> {
let summary = "terminator operation for various regions of SMT operations";
let arguments = (ins Variadic<AnyType>:$values);
diff --git a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h
index fc69b03..f6353a9 100644
--- a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h
+++ b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_TRANSFORM_SMTEXTENSION_SMTEXTENSIONOPS_H
#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/Dialect/SMT/IR/SMTOps.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
diff --git a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td
index b987cb3..9d9783a 100644
--- a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td
+++ b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td
@@ -16,7 +16,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [
DeclareOpInterfaceMethods<TransformOpInterface>,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
- NoTerminator
+ SingleBlockImplicitTerminator<"::mlir::smt::YieldOp">
]> {
let cppNamespace = [{ mlir::transform::smt }];
@@ -24,14 +24,20 @@ def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [
let description = [{
Allows expressing constraints on params using the SMT dialect.
- Each Transform dialect param provided as an operand has a corresponding
+ Each Transform-dialect param provided as an operand has a corresponding
argument of SMT-type in the region. The SMT-Dialect ops in the region use
- these arguments as operands.
+  these params-as-SMT-vars as operands, thereby expressing constraints on
+  the params' allowed values.
+
+  Computations on the passed-in params can also be expressed through the
+  region's SMT ops: the constraints may relate the params to additional
+  SMT variables, which can then be yielded from the region (via `smt.yield`).
  The semantics of this op is that all the ops in the region together express
  a constraint on the params-interpreted-as-smt-vars. The op fails if the
  expressed constraint is not satisfiable per SMTLIB semantics. Otherwise the
- op succeeds.
+ op succeeds and any one satisfying assignment is used to map the
+ SMT-variables yielded in the region to `transform.param`s.
---
@@ -42,9 +48,10 @@ def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [
}];
let arguments = (ins Variadic<TransformParamTypeInterface>:$params);
+ let results = (outs Variadic<TransformParamTypeInterface>:$results);
let regions = (region SizedRegion<1>:$body);
let assemblyFormat =
- "`(` $params `)` attr-dict `:` type(operands) $body";
+ "`(` $params `)` attr-dict `:` functional-type(operands, results) $body";
let hasVerifier = 1;
}
diff --git a/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp b/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp
index 8e7af05..abc1316 100644
--- a/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp
+++ b/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp
@@ -8,8 +8,8 @@
#include "mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h"
#include "mlir/Dialect/SMT/IR/SMTDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformOps.h"
-#include "mlir/Dialect/Transform/SMTExtension/SMTExtension.h"
+#include "mlir/Dialect/SMT/IR/SMTOps.h"
+#include "mlir/Dialect/Transform/IR/TransformTypes.h"
using namespace mlir;
@@ -23,6 +23,7 @@ using namespace mlir;
void transform::smt::ConstrainParamsOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
onlyReadsHandle(getParamsMutable(), effects);
+ producesHandle(getResults(), effects);
}
DiagnosedSilenceableFailure
@@ -37,19 +38,95 @@ transform::smt::ConstrainParamsOp::apply(transform::TransformRewriter &rewriter,
// and allow for users to attach their own implementation, which would,
// e.g., translate the ops to SMTLIB and hand that over to the user's
// favourite solver. This requires changes to the dialect's verifier.
- return emitDefiniteFailure() << "op does not have interpreted semantics yet";
+ return emitSilenceableFailure(getLoc())
+ << "op does not have interpreted semantics yet";
}
LogicalResult transform::smt::ConstrainParamsOp::verify() {
+ auto yieldTerminator =
+ dyn_cast<mlir::smt::YieldOp>(getRegion().front().back());
+ if (!yieldTerminator)
+ return emitOpError() << "expected '"
+ << mlir::smt::YieldOp::getOperationName()
+ << "' as terminator";
+
+ auto checkTypes = [](size_t idx, Type smtType, StringRef smtDesc,
+ Type paramType, StringRef paramDesc,
+ auto *atOp) -> InFlightDiagnostic {
+ if (!isa<mlir::smt::BoolType, mlir::smt::IntType, mlir::smt::BitVectorType>(
+ smtType))
+ return atOp->emitOpError() << "the type of " << smtDesc << " #" << idx
+ << " is expected to be either a !smt.bool, a "
+ "!smt.int, or a !smt.bv";
+
+ assert(isa<TransformParamTypeInterface>(paramType) &&
+ "ODS specifies params' type should implement param interface");
+ if (isa<transform::AnyParamType>(paramType))
+ return {}; // No further checks can be done.
+
+ // NB: This cast must succeed as long as the only implementors of
+ // TransformParamTypeInterface are AnyParamType and ParamType.
+ Type typeWrappedByParam = cast<ParamType>(paramType).getType();
+
+ if (isa<mlir::smt::IntType>(smtType)) {
+ if (!isa<IntegerType>(typeWrappedByParam))
+ return atOp->emitOpError()
+ << "the type of " << smtDesc << " #" << idx
+ << " is !smt.int though the corresponding " << paramDesc
+ << " type (" << paramType << ") is not wrapping an integer type";
+ } else if (isa<mlir::smt::BoolType>(smtType)) {
+ auto wrappedIntType = dyn_cast<IntegerType>(typeWrappedByParam);
+ if (!wrappedIntType || wrappedIntType.getWidth() != 1)
+ return atOp->emitOpError()
+ << "the type of " << smtDesc << " #" << idx
+ << " is !smt.bool though the corresponding " << paramDesc
+ << " type (" << paramType << ") is not wrapping i1";
+ } else if (auto bvSmtType = dyn_cast<mlir::smt::BitVectorType>(smtType)) {
+ auto wrappedIntType = dyn_cast<IntegerType>(typeWrappedByParam);
+ if (!wrappedIntType || wrappedIntType.getWidth() != bvSmtType.getWidth())
+ return atOp->emitOpError()
+ << "the type of " << smtDesc << " #" << idx << " is " << smtType
+ << " though the corresponding " << paramDesc << " type ("
+ << paramType
+ << ") is not wrapping an integer type of the same bitwidth";
+ }
+
+ return {};
+ };
+
if (getOperands().size() != getBody().getNumArguments())
return emitOpError(
"must have the same number of block arguments as operands");
+ for (auto [idx, operandType, blockArgType] :
+ llvm::enumerate(getOperandTypes(), getBody().getArgumentTypes())) {
+ InFlightDiagnostic typeCheckResult =
+ checkTypes(idx, blockArgType, "block arg", operandType, "operand",
+ /*atOp=*/this);
+ if (LogicalResult(typeCheckResult).failed())
+ return typeCheckResult;
+ }
+
for (auto &op : getBody().getOps()) {
if (!isa<mlir::smt::SMTDialect>(op.getDialect()))
return emitOpError(
"ops contained in region should belong to SMT-dialect");
}
+ if (yieldTerminator->getNumOperands() != getNumResults())
+ return yieldTerminator.emitOpError()
+ << "expected terminator to have as many operands as the parent op "
+ "has results";
+
+ for (auto [idx, termOperandType, resultType] : llvm::enumerate(
+ yieldTerminator->getOperands().getType(), getResultTypes())) {
+ InFlightDiagnostic typeCheckResult =
+ checkTypes(idx, termOperandType, "terminator operand",
+ cast<transform::ParamType>(resultType), "result",
+ /*atOp=*/&yieldTerminator);
+ if (LogicalResult(typeCheckResult).failed())
+ return typeCheckResult;
+ }
+
return success();
}
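In short, checkTypes accepts three pairings: !smt.int with a param wrapping any integer type, !smt.bool with a param wrapping i1, and !smt.bv<N> with a param wrapping iN. A table-style C++ restatement of that rule, assuming the param is already known to wrap an integer type (illustrative only, not the MLIR code above):

#include <cassert>

// SMT-side type shapes ConstrainParamsOp accepts, paired with the width of
// the integer type wrapped by the corresponding !transform.param.
enum class SmtKind { Bool, Int, BitVector };
struct SmtType { SmtKind kind; unsigned bvWidth = 0; };

static bool compatible(SmtType smt, unsigned paramIntWidth) {
  switch (smt.kind) {
  case SmtKind::Int:       return true;                         // any integer width
  case SmtKind::Bool:      return paramIntWidth == 1;           // must wrap i1
  case SmtKind::BitVector: return paramIntWidth == smt.bvWidth; // widths must match
  }
  return false;
}

int main() {
  assert(compatible({SmtKind::Int}, 64));
  assert(!compatible({SmtKind::Bool}, 64));         // mismatched_arg_type_bool
  assert(!compatible({SmtKind::BitVector, 8}, 64)); // mismatched_arg_type_bitvector
}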
diff --git a/mlir/python/mlir/dialects/transform/smt.py b/mlir/python/mlir/dialects/transform/smt.py
index 1f0b7f0..af88fff 100644
--- a/mlir/python/mlir/dialects/transform/smt.py
+++ b/mlir/python/mlir/dialects/transform/smt.py
@@ -19,6 +19,7 @@ except ImportError as e:
class ConstrainParamsOp(ConstrainParamsOp):
def __init__(
self,
+ results: Sequence[Type],
params: Sequence[transform.AnyParamType],
arg_types: Sequence[Type],
loc=None,
@@ -27,6 +28,7 @@ class ConstrainParamsOp(ConstrainParamsOp):
if len(params) != len(arg_types):
raise ValueError(f"{params=} not same length as {arg_types=}")
super().__init__(
+ results,
params,
loc=loc,
ip=ip,
@@ -36,3 +38,13 @@ class ConstrainParamsOp(ConstrainParamsOp):
@property
def body(self) -> Block:
return self.regions[0].blocks[0]
+
+
+def constrain_params(
+ results: Sequence[Type],
+ params: Sequence[transform.AnyParamType],
+ arg_types: Sequence[Type],
+ loc=None,
+ ip=None,
+):
+ return ConstrainParamsOp(results, params, arg_types, loc=loc, ip=ip)
diff --git a/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir b/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir
index 314b8d4..d91d69a 100644
--- a/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir
+++ b/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir
@@ -1,11 +1,40 @@
// RUN: mlir-opt %s --transform-interpreter --split-input-file --verify-diagnostics
+// CHECK-LABEL: @incorrect_terminator
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @incorrect_terminator(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i64>
+ // expected-error@below {{op expected 'smt.yield' as terminator}}
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () {
+ ^bb0(%param_as_smt_var: !smt.int):
+ transform.yield
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @operands_not_one_to_one_with_vars
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @operands_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i64>
+ // expected-error@below {{must have the same number of block arguments as operands}}
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () {
+ ^bb0(%param_as_smt_var: !smt.int, %param_as_another_smt_var: !smt.int):
+ }
+ transform.yield
+ }
+}
+
+// -----
+
// CHECK-LABEL: @constraint_not_using_smt_ops
module attributes {transform.with_named_sequence} {
transform.named_sequence @constraint_not_using_smt_ops(%arg0: !transform.any_op {transform.readonly}) {
%param_as_param = transform.param.constant 42 -> !transform.param<i64>
// expected-error@below {{ops contained in region should belong to SMT-dialect}}
- transform.smt.constrain_params(%param_as_param) : !transform.param<i64> {
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () {
^bb0(%param_as_smt_var: !smt.int):
%c4 = arith.constant 4 : i32
// This is the kind of thing one might think works:
@@ -17,13 +46,90 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: @operands_not_one_to_one_with_vars
+// CHECK-LABEL: @results_not_one_to_one_with_vars
module attributes {transform.with_named_sequence} {
- transform.named_sequence @operands_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) {
+ transform.named_sequence @results_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) {
%param_as_param = transform.param.constant 42 -> !transform.param<i64>
- // expected-error@below {{must have the same number of block arguments as operands}}
- transform.smt.constrain_params(%param_as_param) : !transform.param<i64> {
+ transform.smt.constrain_params(%param_as_param, %param_as_param) : (!transform.param<i64>, !transform.param<i64>) -> () {
^bb0(%param_as_smt_var: !smt.int, %param_as_another_smt_var: !smt.int):
+ // expected-error@below {{expected terminator to have as many operands as the parent op has results}}
+ smt.yield %param_as_smt_var : !smt.int
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @non_smt_type_block_args
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @non_smt_type_block_args(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i8>
+ // expected-error@below {{the type of block arg #0 is expected to be either a !smt.bool, a !smt.int, or a !smt.bv}}
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i8>) -> (!transform.param<i8>) {
+ ^bb0(%param_as_smt_var: !transform.param<i8>):
+ smt.yield %param_as_smt_var : !transform.param<i8>
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @mismatched_arg_type_bool
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @mismatched_arg_type_bool(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i64>
+ // expected-error@below {{the type of block arg #0 is !smt.bool though the corresponding operand type ('!transform.param<i64>') is not wrapping i1}}
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> (!transform.param<i64>) {
+ ^bb0(%param_as_smt_var: !smt.bool):
+ smt.yield %param_as_smt_var : !smt.bool
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @mismatched_arg_type_bitvector
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @mismatched_arg_type_bitvector(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i64>
+ // expected-error@below {{the type of block arg #0 is '!smt.bv<8>' though the corresponding operand type ('!transform.param<i64>') is not wrapping an integer type of the same bitwidth}}
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> (!transform.param<i64>) {
+ ^bb0(%param_as_smt_var: !smt.bv<8>):
+ smt.yield %param_as_smt_var : !smt.bv<8>
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @mismatched_result_type_bool
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @mismatched_result_type_bool(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 1 -> !transform.param<i1>
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i1>) -> (!transform.param<i64>) {
+ ^bb0(%param_as_smt_var: !smt.bool):
+ // expected-error@below {{the type of terminator operand #0 is !smt.bool though the corresponding result type ('!transform.param<i64>') is not wrapping i1}}
+ smt.yield %param_as_smt_var : !smt.bool
+ }
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @mismatched_result_type_bitvector
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @mismatched_result_type_bitvector(%arg0: !transform.any_op {transform.readonly}) {
+ %param_as_param = transform.param.constant 42 -> !transform.param<i8>
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i8>) -> (!transform.param<i64>) {
+ ^bb0(%param_as_smt_var: !smt.bv<8>):
+ // expected-error@below {{the type of terminator operand #0 is '!smt.bv<8>' though the corresponding result type ('!transform.param<i64>') is not wrapping an integer type of the same bitwidth}}
+ smt.yield %param_as_smt_var : !smt.bv<8>
}
transform.yield
}
diff --git a/mlir/test/Dialect/Transform/test-smt-extension.mlir b/mlir/test/Dialect/Transform/test-smt-extension.mlir
index 29d1517..6cc41dd 100644
--- a/mlir/test/Dialect/Transform/test-smt-extension.mlir
+++ b/mlir/test/Dialect/Transform/test-smt-extension.mlir
@@ -7,7 +7,7 @@ module attributes {transform.with_named_sequence} {
%param_as_param = transform.param.constant 42 -> !transform.param<i64>
// CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]])
- transform.smt.constrain_params(%param_as_param) : !transform.param<i64> {
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () {
// CHECK: ^bb{{.*}}(%[[PARAM_AS_SMT_SYMB:.*]]: !smt.int):
^bb0(%param_as_smt_var: !smt.int):
// CHECK: %[[C0:.*]] = smt.int.constant 0
@@ -31,18 +31,20 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: @schedule_with_constraint_on_multiple_params
+// CHECK-LABEL: @schedule_with_constraint_on_multiple_params_returning_computed_value
module attributes {transform.with_named_sequence} {
- transform.named_sequence @schedule_with_constraint_on_multiple_params(%arg0: !transform.any_op {transform.readonly}) {
+ transform.named_sequence @schedule_with_constraint_on_multiple_params_returning_computed_value(%arg0: !transform.any_op {transform.readonly}) {
// CHECK: %[[PARAM_A:.*]] = transform.param.constant
%param_a = transform.param.constant 4 -> !transform.param<i64>
// CHECK: %[[PARAM_B:.*]] = transform.param.constant
- %param_b = transform.param.constant 16 -> !transform.param<i64>
+ %param_b = transform.param.constant 32 -> !transform.param<i64>
// CHECK: transform.smt.constrain_params(%[[PARAM_A]], %[[PARAM_B]])
- transform.smt.constrain_params(%param_a, %param_b) : !transform.param<i64>, !transform.param<i64> {
+ %divisor = transform.smt.constrain_params(%param_a, %param_b) : (!transform.param<i64>, !transform.param<i64>) -> (!transform.param<i64>) {
// CHECK: ^bb{{.*}}(%[[VAR_A:.*]]: !smt.int, %[[VAR_B:.*]]: !smt.int):
^bb0(%var_a: !smt.int, %var_b: !smt.int):
+ // CHECK: %[[DIV:.*]] = smt.int.div %[[VAR_B]], %[[VAR_A]]
+ %divisor = smt.int.div %var_b, %var_a
// CHECK: %[[C0:.*]] = smt.int.constant 0
%c0 = smt.int.constant 0
// CHECK: %[[REMAINDER:.*]] = smt.int.mod %[[VAR_B]], %[[VAR_A]]
@@ -51,8 +53,11 @@ module attributes {transform.with_named_sequence} {
%eq = smt.eq %remainder, %c0 : !smt.int
// CHECK: smt.assert %[[EQ]]
smt.assert %eq
+ // CHECK: smt.yield %[[DIV]]
+ smt.yield %divisor : !smt.int
}
- // NB: from here can rely on that %param_a is a divisor of %param_b
+ // NB: from here can rely on that %param_a is a divisor of %param_b and
+ // that the relevant factor, 8, got associated to %divisor.
transform.yield
}
}
@@ -63,10 +68,10 @@ module attributes {transform.with_named_sequence} {
module attributes {transform.with_named_sequence} {
transform.named_sequence @schedule_with_param_as_a_bool(%arg0: !transform.any_op {transform.readonly}) {
// CHECK: %[[PARAM_AS_PARAM:.*]] = transform.param.constant
- %param_as_param = transform.param.constant true -> !transform.any_param
+ %param_as_param = transform.param.constant true -> !transform.param<i1>
// CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]])
- transform.smt.constrain_params(%param_as_param) : !transform.any_param {
+ transform.smt.constrain_params(%param_as_param) : (!transform.param<i1>) -> () {
// CHECK: ^bb{{.*}}(%[[PARAM_AS_SMT_VAR:.*]]: !smt.bool):
^bb0(%param_as_smt_var: !smt.bool):
// CHECK: %[[C0:.*]] = smt.int.constant 0
diff --git a/mlir/test/python/dialects/transform_smt_ext.py b/mlir/test/python/dialects/transform_smt_ext.py
index 3692fd9..e28c56f 100644
--- a/mlir/test/python/dialects/transform_smt_ext.py
+++ b/mlir/test/python/dialects/transform_smt_ext.py
@@ -25,26 +25,44 @@ def run(f):
# CHECK-LABEL: TEST: testConstrainParamsOp
@run
def testConstrainParamsOp(target):
- dummy_value = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42)
+ c42_attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42)
# CHECK: %[[PARAM_AS_PARAM:.*]] = transform.param.constant
- symbolic_value = transform.ParamConstantOp(
- transform.AnyParamType.get(), dummy_value
+ symbolic_value_as_param = transform.ParamConstantOp(
+ transform.AnyParamType.get(), c42_attr
)
# CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]])
constrain_params = transform_smt.ConstrainParamsOp(
- [symbolic_value], [smt.IntType.get()]
+ [], [symbolic_value_as_param], [smt.IntType.get()]
)
# CHECK-NEXT: ^bb{{.*}}(%[[PARAM_AS_SMT_SYMB:.*]]: !smt.int):
with ir.InsertionPoint(constrain_params.body):
+ symbolic_value_as_smt_var = constrain_params.body.arguments[0]
# CHECK: %[[C0:.*]] = smt.int.constant 0
c0 = smt.IntConstantOp(ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0))
# CHECK: %[[C43:.*]] = smt.int.constant 43
c43 = smt.IntConstantOp(ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 43))
# CHECK: %[[LB:.*]] = smt.int.cmp le %[[C0]], %[[PARAM_AS_SMT_SYMB]]
- lb = smt.IntCmpOp(smt.IntPredicate.le, c0, constrain_params.body.arguments[0])
+ lb = smt.IntCmpOp(smt.IntPredicate.le, c0, symbolic_value_as_smt_var)
# CHECK: %[[UB:.*]] = smt.int.cmp le %[[PARAM_AS_SMT_SYMB]], %[[C43]]
- ub = smt.IntCmpOp(smt.IntPredicate.le, constrain_params.body.arguments[0], c43)
+ ub = smt.IntCmpOp(smt.IntPredicate.le, symbolic_value_as_smt_var, c43)
# CHECK: %[[BOUNDED:.*]] = smt.and %[[LB]], %[[UB]]
bounded = smt.AndOp([lb, ub])
# CHECK: smt.assert %[[BOUNDED:.*]]
smt.AssertOp(bounded)
+ smt.YieldOp([])
+
+ # CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]])
+ compute_with_params = transform_smt.ConstrainParamsOp(
+ [transform.ParamType.get(ir.IntegerType.get_signless(32))],
+ [symbolic_value_as_param],
+ [smt.IntType.get()],
+ )
+ # CHECK-NEXT: ^bb{{.*}}(%[[SMT_SYMB:.*]]: !smt.int):
+ with ir.InsertionPoint(compute_with_params.body):
+ symbolic_value_as_smt_var = compute_with_params.body.arguments[0]
+ # CHECK: %[[TWICE:.*]] = smt.int.add %[[SMT_SYMB]], %[[SMT_SYMB]]
+ twice_symb = smt.IntAddOp(
+ [symbolic_value_as_smt_var, symbolic_value_as_smt_var]
+ )
+ # CHECK: smt.yield %[[TWICE]]
+ smt.YieldOp([twice_symb])