diff options
146 files changed, 4256 insertions, 2844 deletions
diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index b752a9b..21455db 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -154,6 +154,7 @@ template <> struct ScalarEnumerationTraits<clang::DiagnosticIDs::Level> { } }; template <> struct SequenceElementTraits<ClangTidyOptions::CustomCheckDiag> { + // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h static const bool flow = false; }; template <> struct MappingTraits<ClangTidyOptions::CustomCheckDiag> { @@ -165,6 +166,7 @@ template <> struct MappingTraits<ClangTidyOptions::CustomCheckDiag> { } }; template <> struct SequenceElementTraits<ClangTidyOptions::CustomCheckValue> { + // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h static const bool flow = false; }; template <> struct MappingTraits<ClangTidyOptions::CustomCheckValue> { diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp index d7cc0ca..a58c041 100644 --- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp @@ -45,7 +45,10 @@ struct StrCatCheckResult { std::vector<FixItHint> Hints; }; -void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) { +} // namespace + +static void removeCallLeaveArgs(const CallExpr *Call, + StrCatCheckResult *CheckResult) { if (Call->getNumArgs() == 0) return; // Remove 'Foo(' @@ -58,9 +61,9 @@ void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) { Call->getRParenLoc(), Call->getEndLoc().getLocWithOffset(1)))); } -const clang::CallExpr *processArgument(const Expr *Arg, - const MatchFinder::MatchResult &Result, - StrCatCheckResult *CheckResult) { +static const clang::CallExpr * +processArgument(const Expr *Arg, const MatchFinder::MatchResult &Result, + StrCatCheckResult *CheckResult) { const auto IsAlphanum = hasDeclaration(cxxMethodDecl(hasName("AlphaNum"))); static const auto *const Strcat = new auto(hasName("::absl::StrCat")); const auto IsStrcat = cxxBindTemporaryExpr( @@ -78,8 +81,8 @@ const clang::CallExpr *processArgument(const Expr *Arg, return nullptr; } -StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, - const MatchFinder::MatchResult &Result) { +static StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, + const MatchFinder::MatchResult &Result) { StrCatCheckResult CheckResult; std::deque<const CallExpr *> CallsToProcess = {RootCall}; @@ -101,7 +104,6 @@ StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, } return CheckResult; } -} // namespace void RedundantStrcatCallsCheck::check(const MatchFinder::MatchResult &Result) { bool IsAppend = false; diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp index 6aad3c6..e90cdd0 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp @@ -215,13 +215,13 @@ bool UnrollLoopsCheck::hasLargeNumIterations(const Stmt *Statement, break; case (BO_MulAssign): Iterations = - 1 + (std::log((double)EndValue) - std::log((double)InitValue)) / - std::log((double)ConstantValue); + 1 + ((std::log((double)EndValue) - std::log((double)InitValue)) / + std::log((double)ConstantValue)); break; case (BO_DivAssign): Iterations = - 1 + (std::log((double)InitValue) - std::log((double)EndValue)) / - std::log((double)ConstantValue); + 1 + ((std::log((double)InitValue) - std::log((double)EndValue)) / + std::log((double)ConstantValue)); break; default: // All other operators are not handled; assume large bounds. diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp index cd83423..48c54c0 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp @@ -16,12 +16,13 @@ using namespace clang::ast_matchers; namespace clang::tidy::android { -namespace { // Helper function to form the correct string mode for Type3. // Build the replace text. If it's string constant, add <Mode> directly in the // end of the string. Else, add <Mode>. -std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM, - const LangOptions &LangOpts, char Mode) { +static std::string buildFixMsgForStringFlag(const Expr *Arg, + const SourceManager &SM, + const LangOptions &LangOpts, + char Mode) { if (Arg->getBeginLoc().isMacroID()) return (Lexer::getSourceText( CharSourceRange::getTokenRange(Arg->getSourceRange()), SM, @@ -32,11 +33,6 @@ std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM, StringRef SR = cast<StringLiteral>(Arg->IgnoreParenCasts())->getString(); return ("\"" + SR + Twine(Mode) + "\"").str(); } -} // namespace - -const char *CloexecCheck::FuncDeclBindingStr = "funcDecl"; - -const char *CloexecCheck::FuncBindingStr = "func"; void CloexecCheck::registerMatchersImpl( MatchFinder *Finder, internal::Matcher<FunctionDecl> Function) { diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h index 79f7ab3..b2b59f5 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h @@ -89,10 +89,10 @@ protected: int N) const; /// Binding name of the FuncDecl of a function call. - static const char *FuncDeclBindingStr; + static constexpr char FuncDeclBindingStr[] = "funcDecl"; /// Binding name of the function call expression. - static const char *FuncBindingStr; + static constexpr char FuncBindingStr[] = "func"; }; } // namespace clang::tidy::android diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index d8207b3..b4ee351 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -1074,7 +1074,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From, WorkType = To; } - if (Ctx.hasSameType(WorkType, To)) { + if (ASTContext::hasSameType(WorkType, To)) { LLVM_DEBUG(llvm::dbgs() << "<<< approximateStdConv. Reached 'To' type.\n"); return {Ctx.getCommonSugaredType(WorkType, To)}; } diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp index 390f3dd..54ed899 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp @@ -18,8 +18,11 @@ namespace { AST_MATCHER(Expr, isInMacro) { return Node.getBeginLoc().isMacroID(); } +} // namespace + /// Find the next statement after `S`. -const Stmt *nextStmt(const MatchFinder::MatchResult &Result, const Stmt *S) { +static const Stmt *nextStmt(const MatchFinder::MatchResult &Result, + const Stmt *S) { auto Parents = Result.Context->getParents(*S); if (Parents.empty()) return nullptr; @@ -40,8 +43,8 @@ using ExpansionRanges = std::vector<SourceRange>; /// \brief Get all the macro expansion ranges related to `Loc`. /// /// The result is ordered from most inner to most outer. -ExpansionRanges getExpansionRanges(SourceLocation Loc, - const MatchFinder::MatchResult &Result) { +static ExpansionRanges +getExpansionRanges(SourceLocation Loc, const MatchFinder::MatchResult &Result) { ExpansionRanges Locs; while (Loc.isMacroID()) { Locs.push_back( @@ -51,8 +54,6 @@ ExpansionRanges getExpansionRanges(SourceLocation Loc, return Locs; } -} // namespace - void MultipleStatementMacroCheck::registerMatchers(MatchFinder *Finder) { const auto Inner = expr(isInMacro(), unless(compoundStmt())).bind("inner"); Finder->addMatcher( diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp index 86af5cb..c262b1c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp @@ -245,12 +245,10 @@ struct OptionEnumMapping< namespace bugprone { -namespace { - /// Returns if a function is declared inside a system header. /// These functions are considered to be "standard" (system-provided) library /// functions. -bool isStandardFunction(const FunctionDecl *FD) { +static bool isStandardFunction(const FunctionDecl *FD) { // Find a possible redeclaration in system header. // FIXME: Looking at the canonical declaration is not the most exact way // to do this. @@ -284,7 +282,7 @@ bool isStandardFunction(const FunctionDecl *FD) { /// Check if a statement is "C++-only". /// This includes all statements that have a class name with "CXX" prefix /// and every other statement that is declared in file ExprCXX.h. -bool isCXXOnlyStmt(const Stmt *S) { +static bool isCXXOnlyStmt(const Stmt *S) { StringRef Name = S->getStmtClassName(); if (Name.starts_with("CXX")) return true; @@ -304,7 +302,8 @@ bool isCXXOnlyStmt(const Stmt *S) { /// called from \p Caller, get a \c CallExpr of the corresponding function call. /// It is unspecified which call is found if multiple calls exist, but the order /// should be deterministic (depend only on the AST). -Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) { +static Expr *findCallExpr(const CallGraphNode *Caller, + const CallGraphNode *Callee) { const auto *FoundCallee = llvm::find_if( Caller->callees(), [Callee](const CallGraphNode::CallRecord &Call) { return Call.Callee == Callee; @@ -314,7 +313,7 @@ Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) { return FoundCallee->CallExpr; } -SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { +static SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { ParentMapContext &PM = Ctx.getParentMapContext(); DynTypedNode P = DynTypedNode::create(*S); while (P.getSourceRange().isInvalid()) { @@ -326,9 +325,9 @@ SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { return P.getSourceRange(); } -AST_MATCHER(FunctionDecl, isStandardFunction) { - return isStandardFunction(&Node); -} +namespace { + +AST_MATCHER(FunctionDecl, isStandard) { return isStandardFunction(&Node); } } // namespace @@ -354,7 +353,7 @@ bool SignalHandlerCheck::isLanguageVersionSupported( void SignalHandlerCheck::registerMatchers(MatchFinder *Finder) { auto SignalFunction = functionDecl(hasAnyName("::signal", "::std::signal"), - parameterCountIs(2), isStandardFunction()); + parameterCountIs(2), isStandard()); auto HandlerExpr = declRefExpr(hasDeclaration(functionDecl().bind("handler_decl")), unless(isExpandedFromMacro("SIG_IGN")), diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index cdb6a08..cf55dd7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -424,7 +424,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { "suspicious usage of 'sizeof(array)/sizeof(...)';" " denominator differs from the size of array elements") << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); - } else if (NumTy && DenomTy && Ctx.hasSameType(NumTy, DenomTy) && + } else if (NumTy && DenomTy && ASTContext::hasSameType(NumTy, DenomTy) && !NumTy->isDependentType()) { // Dependent type should not be compared. diag(E->getOperatorLoc(), @@ -433,7 +433,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); } else if (!WarnOnSizeOfPointer) { // When 'WarnOnSizeOfPointer' is enabled, these messages become redundant: - if (PointedTy && DenomTy && Ctx.hasSameType(PointedTy, DenomTy)) { + if (PointedTy && DenomTy && ASTContext::hasSameType(PointedTy, DenomTy)) { diag(E->getOperatorLoc(), "suspicious usage of 'sizeof(...)/sizeof(...)'; size of pointer " "is divided by size of pointed type") @@ -462,8 +462,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *SizeOfExpr = Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("sizeof-ptr-mul-expr"); - if (Ctx.hasSameType(LPtrTy, RPtrTy) && - Ctx.hasSameType(LPtrTy, SizeofArgTy)) { + if (ASTContext::hasSameType(LPtrTy, RPtrTy) && + ASTContext::hasSameType(LPtrTy, SizeofArgTy)) { diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in " "pointer arithmetic") << SizeOfExpr->getSourceRange() << E->getOperatorLoc() @@ -477,8 +477,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *SizeOfExpr = Result.Nodes.getNodeAs<UnaryExprOrTypeTraitExpr>("sizeof-ptr-div-expr"); - if (Ctx.hasSameType(LPtrTy, RPtrTy) && - Ctx.hasSameType(LPtrTy, SizeofArgTy)) { + if (ASTContext::hasSameType(LPtrTy, RPtrTy) && + ASTContext::hasSameType(LPtrTy, SizeofArgTy)) { diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in " "pointer arithmetic") << SizeOfExpr->getSourceRange() << E->getOperatorLoc() diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp index 0c8d2b8..cef8b4d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp @@ -50,7 +50,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, return false; // Check if return types are identical. - if (Context->hasSameType(DerivedReturnTy, BaseReturnTy)) + if (ASTContext::hasSameType(DerivedReturnTy, BaseReturnTy)) return true; /// Check if the return types are covariant. @@ -77,7 +77,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, if (DRD == BRD) return true; - if (!Context->hasSameUnqualifiedType(DTy, BTy)) { + if (!ASTContext::hasSameUnqualifiedType(DTy, BTy)) { // Begin checking whether the conversion from D to B is valid. CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true, /*DetectVirtual=*/false); @@ -87,7 +87,8 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, return false; // Check ambiguity. - if (Paths.isAmbiguous(Context->getCanonicalType(BTy).getUnqualifiedType())) + if (Paths.isAmbiguous( + ASTContext::getCanonicalType(BTy).getUnqualifiedType())) return false; // Check accessibility. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 37d737a..1ac9b8b 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -28,10 +28,13 @@ AST_MATCHER(CXXRecordDecl, hasDefaultConstructor) { return Node.hasDefaultConstructor(); } +} // namespace + // Iterate over all the fields in a record type, both direct and indirect (e.g. // if the record contains an anonymous struct). template <typename T, typename Func> -void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) { +static void forEachField(const RecordDecl &Record, const T &Fields, + const Func &Fn) { for (const FieldDecl *F : Fields) { if (F->isAnonymousStructOrUnion()) { if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl()) @@ -43,8 +46,9 @@ void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) { } template <typename T, typename Func> -void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, - bool &AnyMemberHasInitPerUnion, const Func &Fn) { +static void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, + bool &AnyMemberHasInitPerUnion, + const Func &Fn) { for (const FieldDecl *F : Fields) { if (F->isAnonymousStructOrUnion()) { if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl()) { @@ -59,8 +63,9 @@ void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, } } -void removeFieldInitialized(const FieldDecl *M, - SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { +static void +removeFieldInitialized(const FieldDecl *M, + SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { const RecordDecl *R = M->getParent(); if (R && R->isUnion()) { // Erase all members in a union if any member of it is initialized. @@ -70,9 +75,9 @@ void removeFieldInitialized(const FieldDecl *M, FieldDecls.erase(M); } -void removeFieldsInitializedInBody( - const Stmt &Stmt, ASTContext &Context, - SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { +static void +removeFieldsInitializedInBody(const Stmt &Stmt, ASTContext &Context, + SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { auto Matches = match(findAll(binaryOperator( hasOperatorName("="), @@ -82,9 +87,9 @@ void removeFieldsInitializedInBody( removeFieldInitialized(Match.getNodeAs<FieldDecl>("fieldDecl"), FieldDecls); } -StringRef getName(const FieldDecl *Field) { return Field->getName(); } +static StringRef getName(const FieldDecl *Field) { return Field->getName(); } -StringRef getName(const RecordDecl *Record) { +static StringRef getName(const RecordDecl *Record) { // Get the typedef name if this is a C-style anonymous struct and typedef. if (const TypedefNameDecl *Typedef = Record->getTypedefNameForAnonDecl()) return Typedef->getName(); @@ -94,7 +99,7 @@ StringRef getName(const RecordDecl *Record) { // Creates comma separated list of decls requiring initialization in order of // declaration. template <typename R, typename T> -std::string +static std::string toCommaSeparatedString(const R &OrderedDecls, const SmallPtrSetImpl<const T *> &DeclsToInit) { SmallVector<StringRef, 16> Names; @@ -105,12 +110,14 @@ toCommaSeparatedString(const R &OrderedDecls, return llvm::join(Names.begin(), Names.end(), ", "); } -SourceLocation getLocationForEndOfToken(const ASTContext &Context, - SourceLocation Location) { +static SourceLocation getLocationForEndOfToken(const ASTContext &Context, + SourceLocation Location) { return Lexer::getLocForEndOfToken(Location, 0, Context.getSourceManager(), Context.getLangOpts()); } +namespace { + // There are 3 kinds of insertion placements: enum class InitializerPlacement { // 1. The fields are inserted after an existing CXXCtorInitializer stored in @@ -187,15 +194,17 @@ struct InitializerInsertion { SmallVector<std::string, 4> Initializers; }; +} // namespace + // Convenience utility to get a RecordDecl from a QualType. -const RecordDecl *getCanonicalRecordDecl(const QualType &Type) { +static const RecordDecl *getCanonicalRecordDecl(const QualType &Type) { if (const auto *RT = Type->getAsCanonical<RecordType>()) return RT->getDecl(); return nullptr; } template <typename R, typename T> -SmallVector<InitializerInsertion, 16> +static SmallVector<InitializerInsertion, 16> computeInsertions(const CXXConstructorDecl::init_const_range &Inits, const R &OrderedDecls, const SmallPtrSetImpl<const T *> &DeclsToInit) { @@ -239,8 +248,9 @@ computeInsertions(const CXXConstructorDecl::init_const_range &Inits, // Gets the list of bases and members that could possibly be initialized, in // order as they appear in the class declaration. -void getInitializationsInOrder(const CXXRecordDecl &ClassDecl, - SmallVectorImpl<const NamedDecl *> &Decls) { +static void +getInitializationsInOrder(const CXXRecordDecl &ClassDecl, + SmallVectorImpl<const NamedDecl *> &Decls) { Decls.clear(); for (const auto &Base : ClassDecl.bases()) { // Decl may be null if the base class is a template parameter. @@ -253,9 +263,10 @@ void getInitializationsInOrder(const CXXRecordDecl &ClassDecl, } template <typename T> -void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag, - const CXXConstructorDecl *Ctor, - const SmallPtrSetImpl<const T *> &DeclsToInit) { +static void fixInitializerList(const ASTContext &Context, + DiagnosticBuilder &Diag, + const CXXConstructorDecl *Ctor, + const SmallPtrSetImpl<const T *> &DeclsToInit) { // Do not propose fixes in macros since we cannot place them correctly. if (Ctor->getBeginLoc().isMacroID()) return; @@ -271,8 +282,6 @@ void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag, } } -} // anonymous namespace - ProTypeMemberInitCheck::ProTypeMemberInitCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp index 0d81b9a..bd51cc5 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp @@ -111,10 +111,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, } RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() { - Stencil message = cat("use 'OpType::create(builder, ...)' instead of " + Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " "'builder.create<OpType>(...)'"); // Match a create call on an OpBuilder. - ast_matchers::internal::Matcher<Stmt> base = + ast_matchers::internal::Matcher<Stmt> Base = cxxMemberCallExpr( on(expr(hasType( cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")))) @@ -124,10 +124,10 @@ RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() { .bind("call"); return applyFirst( // Attempt rewrite given an lvalue builder, else just warn. - {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), base), + {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base), rewrite(node("call"), node("builder"), callArgs("call")), - message), - makeRule(base, noopEdit(node("call")), message)}); + Message), + makeRule(Base, noopEdit(node("call")), Message)}); } } // namespace diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp index 8ec7695..3b9b8e0 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp @@ -60,12 +60,12 @@ AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher<Expr>, return InnerMatcher.matches(*E, Finder, Builder); return false; } +} // namespace -auto typeWithNameIn(const std::vector<StringRef> &Names) { +static auto typeWithNameIn(const std::vector<StringRef> &Names) { return hasType( hasCanonicalType(hasDeclaration(namedDecl(hasAnyName(Names))))); } -} // namespace CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index 5e0f32a..9801c9e 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -53,7 +53,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) { const auto *FPT = Node.getType()->castAs<FunctionProtoType>(); ASTContext &Ctx = Node.getASTContext(); if (Ctx.getLangOpts().SizedDeallocation && - Ctx.hasSameType(FPT->getParamType(1), Ctx.getSizeType())) + ASTContext::hasSameType(FPT->getParamType(1), Ctx.getSizeType())) return false; return true; diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp index 0d7667c..035598d 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp @@ -151,10 +151,12 @@ constexpr unsigned SmallSCCSize = 32; using CallStackTy = llvm::SmallVector<CallGraphNode::CallRecord, SmallCallStackSize>; +} // namespace + // In given SCC, find *some* call stack that will be cyclic. // This will only find *one* such stack, it might not be the smallest one, // and there may be other loops. -CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { +static CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { // We'll need to be able to performantly look up whether some CallGraphNode // is in SCC or not, so cache all the SCC elements in a set. const ImmutableSmallSet<CallGraphNode *, SmallSCCSize> SCCElts(SCC); @@ -190,8 +192,6 @@ CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { return CallStack; } -} // namespace - void NoRecursionCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(translationUnitDecl().bind("TUDecl"), this); } diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index 17a8a50..6baa12a 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ -29,7 +29,6 @@ using namespace clang::ast_matchers; using namespace clang::tidy::matchers; namespace clang::tidy::misc { -namespace { using llvm::APSInt; static constexpr llvm::StringLiteral KnownBannedMacroNames[] = { @@ -420,6 +419,8 @@ markDuplicateOperands(const TExpr *TheExpr, return Duplicates.any(); } +namespace { + AST_MATCHER(Expr, isIntegerConstantExpr) { if (Node.isInstantiationDependent()) return false; @@ -470,6 +471,8 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef<llvm::StringLiteral>, Names) { return false; } +} // namespace + // Returns a matcher for integer constant expressions. static ast_matchers::internal::Matcher<Expr> matchIntegerConstantExpr(StringRef Id) { @@ -805,7 +808,8 @@ static bool isSameRawIdentifierToken(const Token &T1, const Token &T2, StringRef(SM.getCharacterData(T2.getLocation()), T2.getLength()); } -bool isTokAtEndOfExpr(SourceRange ExprSR, Token T, const SourceManager &SM) { +static bool isTokAtEndOfExpr(SourceRange ExprSR, Token T, + const SourceManager &SM) { return SM.getExpansionLoc(ExprSR.getEnd()) == T.getLocation(); } @@ -921,7 +925,6 @@ static bool areExprsSameMacroOrLiteral(const BinaryOperator *BinOp, return false; } -} // namespace void RedundantExpressionCheck::registerMatchers(MatchFinder *Finder) { const auto BannedIntegerLiteral = diff --git a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp index 27ddb7c..ab2077b 100644 --- a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp @@ -53,9 +53,8 @@ void UniqueptrResetReleaseCheck::registerMatchers(MatchFinder *Finder) { this); } -namespace { -const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, - StringRef ID) { +static const Type * +getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, StringRef ID) { const auto *Class = Result.Nodes.getNodeAs<ClassTemplateSpecializationDecl>(ID); if (!Class) @@ -66,7 +65,7 @@ const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, return DeleterArgument.getAsType().getTypePtr(); } -bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { +static bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { const Type *LeftDeleterType = getDeleterForUniquePtr(Result, "left_class"); const Type *RightDeleterType = getDeleterForUniquePtr(Result, "right_class"); @@ -103,8 +102,6 @@ bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { return false; } -} // namespace - void UniqueptrResetReleaseCheck::check(const MatchFinder::MatchResult &Result) { if (!areDeletersCompatible(Result)) return; diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index 37482583..fea5ac6 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -499,7 +499,7 @@ static bool canBeModified(ASTContext *Context, const Expr *E) { return true; if (const auto *Cast = Parents[0].get<ImplicitCastExpr>()) { if ((Cast->getCastKind() == CK_NoOp && - Context->hasSameType(Cast->getType(), E->getType().withConst())) || + ASTContext::hasSameType(Cast->getType(), E->getType().withConst())) || (Cast->getCastKind() == CK_LValueToRValue && !Cast->getType().isNull() && Cast->getType()->isFundamentalType())) return false; @@ -664,7 +664,8 @@ void LoopConvertCheck::doConversion( AliasVarIsRef = true; } if (Descriptor.ElemType.isNull() || - !Context->hasSameUnqualifiedType(AliasVarType, Descriptor.ElemType)) + !ASTContext::hasSameUnqualifiedType(AliasVarType, + Descriptor.ElemType)) Descriptor.ElemType = AliasVarType; } @@ -944,7 +945,7 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, CanonicalInitVarType->isPointerType()) { // If the initializer and the variable are both pointers check if the // un-qualified pointee types match, otherwise we don't use auto. - return Context->hasSameUnqualifiedType( + return ASTContext::hasSameUnqualifiedType( CanonicalBeginType->getPointeeType(), CanonicalInitVarType->getPointeeType()); } diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 286c39b..586deea 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -370,7 +370,7 @@ static bool isAliasDecl(ASTContext *Context, const Decl *TheDecl, DeclarationType = DeclarationType.getNonReferenceType(); if (InitType.isNull() || DeclarationType.isNull() || - !Context->hasSameUnqualifiedType(DeclarationType, InitType)) + !ASTContext::hasSameUnqualifiedType(DeclarationType, InitType)) return false; } diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index c7fd0a9..01796a6 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -316,7 +316,7 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) { if (NestedConstruct->getConstructor()->isConvertingConstructor(false)) return; } - if (!Context->hasSameType(V->getType(), E->getType())) + if (!ASTContext::hasSameType(V->getType(), E->getType())) return; } @@ -378,7 +378,7 @@ void UseAutoCheck::replaceExpr( return; // If VarDecl and Initializer have mismatching unqualified types. - if (!Context->hasSameUnqualifiedType(V->getType(), GetType(Expr))) + if (!ASTContext::hasSameUnqualifiedType(V->getType(), GetType(Expr))) return; // All subsequent variables in this declaration should have the same diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index b921819..b6834c6 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -21,8 +21,6 @@ using namespace llvm; namespace clang::tidy::modernize { namespace { -const char CastSequence[] = "sequence"; - AST_MATCHER(Type, sugaredNullptrType) { const Type *DesugaredType = Node.getUnqualifiedDesugaredType(); if (const auto *BT = dyn_cast<BuiltinType>(DesugaredType)) @@ -30,6 +28,10 @@ AST_MATCHER(Type, sugaredNullptrType) { return false; } +} // namespace + +static const char CastSequence[] = "sequence"; + /// Create a matcher that finds implicit casts as well as the head of a /// sequence of zero or more nested explicit casts that have an implicit cast /// to null within. @@ -43,7 +45,8 @@ AST_MATCHER(Type, sugaredNullptrType) { /// would check for the "NULL" macro instead, but that'd be harder to express. /// In practice, "NULL" is often defined as "__null", and this is a useful /// condition. -StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) { +static StatementMatcher +makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) { auto ImplicitCastToNull = implicitCastExpr( anyOf(hasCastKind(CK_NullToPointer), hasCastKind(CK_NullToMemberPointer)), anyOf(hasSourceExpression(gnuNullExpr()), @@ -79,16 +82,16 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) { unless(hasAncestor(functionDecl(isDefaulted())))))); } -bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, - const SourceManager &SM) { +static bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, + const SourceManager &SM) { return SM.isWrittenInSameFile(StartLoc, EndLoc); } /// Replaces the provided range with the text "nullptr", but only if /// the start and end location are both in main file. /// Returns true if and only if a replacement was made. -void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, - SourceLocation StartLoc, SourceLocation EndLoc) { +static void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, + SourceLocation StartLoc, SourceLocation EndLoc) { CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); // Add a space if nullptr follows an alphanumeric character. This happens // whenever there is an c-style explicit cast to nullptr not surrounded by @@ -106,8 +109,9 @@ void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, /// #define MY_NULL NULL /// \endcode /// If \p Loc points to NULL, this function will return the name MY_NULL. -StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM, - const LangOptions &LO) { +static StringRef getOutermostMacroName(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LO) { assert(Loc.isMacroID()); SourceLocation OutermostMacroLoc; @@ -119,6 +123,8 @@ StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM, return Lexer::getImmediateMacroName(OutermostMacroLoc, SM, LO); } +namespace { + /// RecursiveASTVisitor for ensuring all nodes rooted at a given AST /// subtree that have file-level source locations corresponding to a macro /// argument have implicit NullTo(Member)Pointer nodes as ancestors. diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp index d26480f..7c90130 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp @@ -17,9 +17,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::performance { -namespace { - -std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) { +static std::optional<std::string> +makeCharacterLiteral(const StringLiteral *Literal) { std::string Result; { llvm::raw_string_ostream OS(Result); @@ -43,6 +42,8 @@ std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) { return Result; } +namespace { + AST_MATCHER_FUNCTION(ast_matchers::internal::Matcher<Expr>, hasSubstitutedType) { return hasType(qualType(anyOf(substTemplateTypeParmType(), diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp index 3da1469..4a8f292 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp @@ -17,8 +17,6 @@ using namespace clang::ast_matchers; namespace clang::tidy::performance { -namespace { - // Matcher names. Given the code: // // \code @@ -60,12 +58,14 @@ static const char LoopInitVarName[] = "loop_init_var"; static const char LoopEndExprName[] = "loop_end_expr"; static const char RangeLoopName[] = "for_range_loop"; -ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() { +static ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() { return hasType(cxxRecordDecl(hasAnyName( "::std::vector", "::std::set", "::std::unordered_set", "::std::map", "::std::unordered_map", "::std::array", "::std::deque"))); } +namespace { + AST_MATCHER(Expr, hasSideEffects) { return Node.HasSideEffects(Finder->getASTContext()); } diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index 570a109..0237c05 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -64,6 +64,8 @@ private: const SourceManager &SM; }; +} // namespace + void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc, FileChangeReason Reason, SrcMgr::CharacteristicKind FileType, @@ -107,8 +109,6 @@ void DuplicateIncludeCallbacks::MacroUndefined(const Token &MacroNameTok, Files.back().clear(); } -} // namespace - void DuplicateIncludeCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { PP->addPPCallbacks(std::make_unique<DuplicateIncludeCallbacks>(*this, SM)); diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index bfdf9cb..6f6da57 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -51,7 +51,7 @@ static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind, return Type->isUnsignedIntegerType() ? "0u" : "0"; case CK_FloatingToBoolean: - return Context.hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0"; + return ASTContext::hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0"; case CK_PointerToBoolean: case CK_MemberPointerToBoolean: // Fall-through on purpose. @@ -215,7 +215,7 @@ getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral, } if (DestType->isFloatingType()) { - if (Context.hasSameType(DestType, Context.FloatTy)) { + if (ASTContext::hasSameType(DestType, Context.FloatTy)) { return BoolLiteral->getValue() ? "1.0f" : "0.0f"; } return BoolLiteral->getValue() ? "1.0" : "0.0"; diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index 2eb26fc..93580a7 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -54,9 +54,12 @@ struct InconsistentDeclarationInfo { using InconsistentDeclarationsContainer = llvm::SmallVector<InconsistentDeclarationInfo, 2>; -bool checkIfFixItHintIsApplicable( - const FunctionDecl *ParameterSourceDeclaration, - const ParmVarDecl *SourceParam, const FunctionDecl *OriginalDeclaration) { +} // namespace + +static bool +checkIfFixItHintIsApplicable(const FunctionDecl *ParameterSourceDeclaration, + const ParmVarDecl *SourceParam, + const FunctionDecl *OriginalDeclaration) { // Assumptions with regard to function declarations/definition: // * If both function declaration and definition are seen, assume that // definition is most up-to-date, and use it to generate replacements. @@ -83,7 +86,7 @@ bool checkIfFixItHintIsApplicable( return true; } -bool nameMatch(StringRef L, StringRef R, bool Strict) { +static bool nameMatch(StringRef L, StringRef R, bool Strict) { if (Strict) return L.empty() || R.empty() || L == R; // We allow two names if one is a prefix/suffix of the other, ignoring case. @@ -92,7 +95,7 @@ bool nameMatch(StringRef L, StringRef R, bool Strict) { L.ends_with_insensitive(R) || R.ends_with_insensitive(L); } -DifferingParamsContainer +static DifferingParamsContainer findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OtherDeclaration, const FunctionDecl *OriginalDeclaration, @@ -129,7 +132,7 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, return DifferingParams; } -InconsistentDeclarationsContainer +static InconsistentDeclarationsContainer findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, const FunctionDecl *ParameterSourceDeclaration, SourceManager &SM, bool Strict) { @@ -162,7 +165,7 @@ findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, return InconsistentDeclarations; } -const FunctionDecl * +static const FunctionDecl * getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) { const FunctionTemplateDecl *PrimaryTemplate = OriginalDeclaration->getPrimaryTemplate(); @@ -187,7 +190,7 @@ getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) { return OriginalDeclaration; } -std::string joinParameterNames( +static std::string joinParameterNames( const DifferingParamsContainer &DifferingParams, llvm::function_ref<StringRef(const DifferingParamInfo &)> ChooseParamName) { llvm::SmallString<40> Str; @@ -202,7 +205,7 @@ std::string joinParameterNames( return std::string(Str); } -void formatDifferingParamsDiagnostic( +static void formatDifferingParamsDiagnostic( InconsistentDeclarationParameterNameCheck *Check, SourceLocation Location, StringRef OtherDeclarationDescription, const DifferingParamsContainer &DifferingParams) { @@ -230,7 +233,7 @@ void formatDifferingParamsDiagnostic( } } -void formatDiagnosticsForDeclarations( +static void formatDiagnosticsForDeclarations( InconsistentDeclarationParameterNameCheck *Check, const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OriginalDeclaration, @@ -256,7 +259,7 @@ void formatDiagnosticsForDeclarations( } } -void formatDiagnostics( +static void formatDiagnostics( InconsistentDeclarationParameterNameCheck *Check, const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OriginalDeclaration, @@ -279,8 +282,6 @@ void formatDiagnostics( } } -} // anonymous namespace - void InconsistentDeclarationParameterNameCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp index dc9510d..942a0a8 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp @@ -142,12 +142,11 @@ void QualifiedAutoCheck::registerMatchers(MatchFinder *Finder) { if (this->IgnoreAliasing) { return qualType( hasUnqualifiedDesugaredType(pointerType(pointee(InnerMatchers...)))); - } else { - return qualType( - anyOf(qualType(pointerType(pointee(InnerMatchers...))), - qualType(substTemplateTypeParmType(hasReplacementType( - pointerType(pointee(InnerMatchers...))))))); } + return qualType(anyOf(qualType(pointerType(pointee(InnerMatchers...))), + qualType(substTemplateTypeParmType(hasReplacementType( + pointerType(pointee(InnerMatchers...))))))); + }; auto IsAutoDeducedToPointer = diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 0598683..107291d 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -14,8 +14,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { -namespace { -internal::Matcher<Expr> callToGet(const internal::Matcher<Decl> &OnClass) { +static internal::Matcher<Expr> +callToGet(const internal::Matcher<Decl> &OnClass) { return expr( anyOf(cxxMemberCallExpr( on(expr(anyOf(hasType(OnClass), @@ -43,12 +43,13 @@ internal::Matcher<Expr> callToGet(const internal::Matcher<Decl> &OnClass) { .bind("redundant_get"); } -internal::Matcher<Decl> knownSmartptr() { +static internal::Matcher<Decl> knownSmartptr() { return recordDecl(hasAnyName("::std::unique_ptr", "::std::shared_ptr")); } -void registerMatchersForGetArrowStart(MatchFinder *Finder, - MatchFinder::MatchCallback *Callback) { +static void +registerMatchersForGetArrowStart(MatchFinder *Finder, + MatchFinder::MatchCallback *Callback) { const auto MatchesOpArrow = allOf(hasName("operator->"), returns(qualType(pointsTo(type().bind("op->Type"))))); @@ -100,8 +101,8 @@ void registerMatchersForGetArrowStart(MatchFinder *Finder, Callback); } -void registerMatchersForGetEquals(MatchFinder *Finder, - MatchFinder::MatchCallback *Callback) { +static void registerMatchersForGetEquals(MatchFinder *Finder, + MatchFinder::MatchCallback *Callback) { // This one is harder to do with duck typing. // The operator==/!= that we are looking for might be member or non-member, // might be on global namespace or found by ADL, might be a template, etc. @@ -118,8 +119,6 @@ void registerMatchersForGetEquals(MatchFinder *Finder, // FIXME: Match and fix if (l.get() == r.get()). } -} // namespace - void RedundantSmartptrGetCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index d1738f1..feb248d 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -288,8 +288,8 @@ static bool applyDiceHeuristic(StringRef Arg, StringRef Param, std::size_t Intersection = 0; // Find the intersection between the two sets. - for (auto IT = ParamBigrams.begin(); IT != ParamBigrams.end(); ++IT) - Intersection += ArgBigrams.count((IT->getKey())); + for (const auto &[Key, _] : ParamBigrams) + Intersection += ArgBigrams.count(Key); // Calculate Dice coefficient. return percentage(Intersection * 2.0, diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp index c1dc209..740a68d 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp @@ -55,8 +55,10 @@ struct NewSuffix { std::optional<FixItHint> FixIt; }; -std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc, - const SourceManager &SM) { +} // namespace + +static std::optional<SourceLocation> +getMacroAwareLocation(SourceLocation Loc, const SourceManager &SM) { // Do nothing if the provided location is invalid. if (Loc.isInvalid()) return std::nullopt; @@ -67,8 +69,8 @@ std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc, return SpellingLoc; } -std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc, - const SourceManager &SM) { +static std::optional<SourceRange> +getMacroAwareSourceRange(SourceRange Loc, const SourceManager &SM) { std::optional<SourceLocation> Begin = getMacroAwareLocation(Loc.getBegin(), SM); std::optional<SourceLocation> End = getMacroAwareLocation(Loc.getEnd(), SM); @@ -77,7 +79,7 @@ std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc, return SourceRange(*Begin, *End); } -std::optional<std::string> +static std::optional<std::string> getNewSuffix(llvm::StringRef OldSuffix, const std::vector<StringRef> &NewSuffixes) { // If there is no config, just uppercase the entirety of the suffix. @@ -96,7 +98,7 @@ getNewSuffix(llvm::StringRef OldSuffix, } template <typename LiteralType> -std::optional<NewSuffix> +static std::optional<NewSuffix> shouldReplaceLiteralSuffix(const Expr &Literal, const std::vector<StringRef> &NewSuffixes, const SourceManager &SM, const LangOptions &LO) { @@ -174,8 +176,6 @@ shouldReplaceLiteralSuffix(const Expr &Literal, return ReplacementDsc; } -} // namespace - UppercaseLiteralSuffixCheck::UppercaseLiteralSuffixCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index 57453ad..a5b0883 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -19,9 +19,8 @@ namespace clang::tidy::utils::decl_ref_expr { using namespace ::clang::ast_matchers; using llvm::SmallPtrSet; -namespace { - -template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) { +template <typename S> +static bool isSetDifferenceEmpty(const S &S1, const S &S2) { for (auto E : S1) if (S2.count(E) == 0) return false; @@ -30,15 +29,15 @@ template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) { // Extracts all Nodes keyed by ID from Matches and inserts them into Nodes. template <typename Node> -void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID, - SmallPtrSet<const Node *, 16> &Nodes) { +static void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID, + SmallPtrSet<const Node *, 16> &Nodes) { for (const auto &Match : Matches) Nodes.insert(Match.getNodeAs<Node>(ID)); } // Returns true if both types refer to the same type, // ignoring the const-qualifier. -bool isSameTypeIgnoringConst(QualType A, QualType B) { +static bool isSameTypeIgnoringConst(QualType A, QualType B) { A = A.getCanonicalType(); B = B.getCanonicalType(); A.addConst(); @@ -47,7 +46,8 @@ bool isSameTypeIgnoringConst(QualType A, QualType B) { } // Returns true if `D` and `O` have the same parameter types. -bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) { +static bool hasSameParameterTypes(const CXXMethodDecl &D, + const CXXMethodDecl &O) { if (D.getNumParams() != O.getNumParams()) return false; for (int I = 0, E = D.getNumParams(); I < E; ++I) { @@ -60,7 +60,7 @@ bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) { // If `D` has a const-qualified overload with otherwise identical // ref-qualifiers and parameter types, returns that overload. -const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { +static const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { assert(!D.isConst()); DeclContext::lookup_result LookupResult = @@ -81,7 +81,7 @@ const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { // Returns true if both types are pointers or reference to the same type, // ignoring the const-qualifier. -bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { +static bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { assert(A->isPointerType() || A->isReferenceType()); assert(B->isPointerType() || B->isReferenceType()); return isSameTypeIgnoringConst(A->getPointeeType(), B->getPointeeType()); @@ -122,7 +122,7 @@ bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { // // This function checks (A) ad (B), but the caller should make sure that the // object is not mutated through the return value. -bool isLikelyShallowConst(const CXXMethodDecl &M) { +static bool isLikelyShallowConst(const CXXMethodDecl &M) { assert(!M.isConst()); // The method can mutate our variable. @@ -146,6 +146,8 @@ bool isLikelyShallowConst(const CXXMethodDecl &M) { return isSameTypeIgnoringConst(CallTy, OverloadTy); } +namespace { + // A matcher that matches DeclRefExprs that are used in ways such that the // underlying declaration is not modified. // If the declaration is of pointer type, `Indirections` specifies the level diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp index 044f89b..b068ae2 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp @@ -19,8 +19,6 @@ namespace clang::tidy::utils { -namespace { - /// Returns true if Name is reserved, like _Foo or __Vector_base. static inline bool isReservedName(llvm::StringRef Name) { // This doesn't catch all cases, but the most common. @@ -28,6 +26,8 @@ static inline bool isReservedName(llvm::StringRef Name) { (isUppercase(Name[1]) || Name[1] == '_'); } +namespace { + // Helper class to iterate over the designator names of an aggregate type. // // For an array type, yields [0], [1], [2]... @@ -112,6 +112,8 @@ private: RecordDecl::field_iterator FieldsEnd; }; +} // namespace + // Collect designator labels describing the elements of an init list. // // This function contributes the designators of some (sub)object, which is @@ -127,10 +129,9 @@ private: // '.a:' is produced directly without recursing into the written sublist. // (The written sublist will have a separate collectDesignators() call later). // Recursion with Prefix='.b' and Sem = {3, ImplicitValue} produces '.b.x:'. -void collectDesignators(const InitListExpr *Sem, - llvm::DenseMap<SourceLocation, std::string> &Out, - const llvm::DenseSet<SourceLocation> &NestedBraces, - std::string &Prefix) { +static void collectDesignators( + const InitListExpr *Sem, llvm::DenseMap<SourceLocation, std::string> &Out, + const llvm::DenseSet<SourceLocation> &NestedBraces, std::string &Prefix) { if (!Sem || Sem->isTransparent()) return; assert(Sem->isSemanticForm()); @@ -170,8 +171,6 @@ void collectDesignators(const InitListExpr *Sem, } } -} // namespace - llvm::DenseMap<SourceLocation, std::string> getUnwrittenDesignators(const InitListExpr *Syn) { assert(Syn->isSyntacticForm()); diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index b746df5..570cab2 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -245,7 +245,7 @@ the configuration (without a prefix: ``Auto``). .. note:: This currently only applies to braced initializer lists (when - ``Cpp11BracedListStyle`` is ``true``) and parentheses. + ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. @@ -3816,29 +3816,72 @@ the configuration (without a prefix: ``Auto``). .. _Cpp11BracedListStyle: -**Cpp11BracedListStyle** (``Boolean``) :versionbadge:`clang-format 3.4` :ref:`¶ <Cpp11BracedListStyle>` - If ``true``, format braced lists as best suited for C++11 braced - lists. +**Cpp11BracedListStyle** (``BracedListStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ <Cpp11BracedListStyle>` + The style to handle braced lists. - Important differences: + Possible values: - * No spaces inside the braced list. - * No line break before the closing brace. - * Indentation with the continuation indent, not with the block indent. + * ``BLS_Block`` (in configuration: ``Block``) + Best suited for pre C++11 braced lists. - Fundamentally, C++11 braced lists are formatted exactly like function - calls would be formatted in their place. If the braced list follows a name - (e.g. a type or variable name), clang-format formats as if the ``{}`` were - the parentheses of a function call with that name. If there is no name, - a zero-length name is assumed. + * Spaces inside the braced list. + * Line break before the closing brace. + * Indentation with the block indent. + + + .. code-block:: c++ + + vector<int> x{ 1, 2, 3, 4 }; + vector<T> x{ {}, {}, {}, {} }; + f(MyMap[{ composite, key }]); + new int[3]{ 1, 2, 3 }; + Type name{ // Comment + value + }; + + * ``BLS_FunctionCall`` (in configuration: ``FunctionCall``) + Best suited for C++11 braced lists. + + * No spaces inside the braced list. + * No line break before the closing brace. + * Indentation with the continuation indent. + + Fundamentally, C++11 braced lists are formatted exactly like function + calls would be formatted in their place. If the braced list follows a + name (e.g. a type or variable name), clang-format formats as if the + ``{}`` were the parentheses of a function call with that name. If there + is no name, a zero-length name is assumed. + + .. code-block:: c++ + + vector<int> x{1, 2, 3, 4}; + vector<T> x{{}, {}, {}, {}}; + f(MyMap[{composite, key}]); + new int[3]{1, 2, 3}; + Type name{ // Comment + value}; + + * ``BLS_AlignFirstComment`` (in configuration: ``AlignFirstComment``) + Same as ``FunctionCall``, except for the handling of a comment at the + begin, it then aligns everything following with the comment. + + * No spaces inside the braced list. (Even for a comment at the first + position.) + * No line break before the closing brace. + * Indentation with the continuation indent, except when followed by a + line comment, then it uses the block indent. + + + .. code-block:: c++ + + vector<int> x{1, 2, 3, 4}; + vector<T> x{{}, {}, {}, {}}; + f(MyMap[{composite, key}]); + new int[3]{1, 2, 3}; + Type name{// Comment + value}; - .. code-block:: c++ - true: false: - vector<int> x{1, 2, 3, 4}; vs. vector<int> x{ 1, 2, 3, 4 }; - vector<T> x{{}, {}, {}, {}}; vector<T> x{ {}, {}, {}, {} }; - f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]); - new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 }; .. _DeriveLineEnding: @@ -6625,7 +6668,7 @@ the configuration (without a prefix: ``Auto``). .. note:: This option doesn't apply to initializer braces if - ``Cpp11BracedListStyle`` is set to ``true``. + ``Cpp11BracedListStyle`` is not ``Block``. Possible values: diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 62c70fba..d03c778 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -124,13 +124,13 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } let Features = "ssse3" in { - def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; } let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; } @@ -608,7 +608,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; - def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; @@ -661,6 +660,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; + def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; @@ -1386,13 +1386,10 @@ let Features = "avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; -} - let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pavgb512 : X86Builtin<"_Vector<64, unsigned char>(_Vector<64, unsigned char>, _Vector<64, unsigned char>)">; def pavgw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">; + def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; def pmulhuw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">; def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 3df5b92..2852c4a 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -94,7 +94,7 @@ struct FormatStyle { /// /// \note /// This currently only applies to braced initializer lists (when - /// ``Cpp11BracedListStyle`` is ``true``) and parentheses. + /// ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. /// \endnote BAS_BlockIndent, }; @@ -2555,29 +2555,67 @@ struct FormatStyle { /// \version 3.7 unsigned ContinuationIndentWidth; - /// If ``true``, format braced lists as best suited for C++11 braced - /// lists. - /// - /// Important differences: - /// - /// * No spaces inside the braced list. - /// * No line break before the closing brace. - /// * Indentation with the continuation indent, not with the block indent. - /// - /// Fundamentally, C++11 braced lists are formatted exactly like function - /// calls would be formatted in their place. If the braced list follows a name - /// (e.g. a type or variable name), clang-format formats as if the ``{}`` were - /// the parentheses of a function call with that name. If there is no name, - /// a zero-length name is assumed. - /// \code - /// true: false: - /// vector<int> x{1, 2, 3, 4}; vs. vector<int> x{ 1, 2, 3, 4 }; - /// vector<T> x{{}, {}, {}, {}}; vector<T> x{ {}, {}, {}, {} }; - /// f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]); - /// new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 }; - /// \endcode + /// Different ways to handle braced lists. + enum BracedListStyle : int8_t { + /// Best suited for pre C++11 braced lists. + /// + /// * Spaces inside the braced list. + /// * Line break before the closing brace. + /// * Indentation with the block indent. + /// + /// \code + /// vector<int> x{ 1, 2, 3, 4 }; + /// vector<T> x{ {}, {}, {}, {} }; + /// f(MyMap[{ composite, key }]); + /// new int[3]{ 1, 2, 3 }; + /// Type name{ // Comment + /// value + /// }; + /// \endcode + BLS_Block, + /// Best suited for C++11 braced lists. + /// + /// * No spaces inside the braced list. + /// * No line break before the closing brace. + /// * Indentation with the continuation indent. + /// + /// Fundamentally, C++11 braced lists are formatted exactly like function + /// calls would be formatted in their place. If the braced list follows a + /// name (e.g. a type or variable name), clang-format formats as if the + /// ``{}`` were the parentheses of a function call with that name. If there + /// is no name, a zero-length name is assumed. + /// \code + /// vector<int> x{1, 2, 3, 4}; + /// vector<T> x{{}, {}, {}, {}}; + /// f(MyMap[{composite, key}]); + /// new int[3]{1, 2, 3}; + /// Type name{ // Comment + /// value}; + /// \endcode + BLS_FunctionCall, + /// Same as ``FunctionCall``, except for the handling of a comment at the + /// begin, it then aligns everything following with the comment. + /// + /// * No spaces inside the braced list. (Even for a comment at the first + /// position.) + /// * No line break before the closing brace. + /// * Indentation with the continuation indent, except when followed by a + /// line comment, then it uses the block indent. + /// + /// \code + /// vector<int> x{1, 2, 3, 4}; + /// vector<T> x{{}, {}, {}, {}}; + /// f(MyMap[{composite, key}]); + /// new int[3]{1, 2, 3}; + /// Type name{// Comment + /// value}; + /// \endcode + BLS_AlignFirstComment, + }; + + /// The style to handle braced lists. /// \version 3.4 - bool Cpp11BracedListStyle; + BracedListStyle Cpp11BracedListStyle; /// This option is **deprecated**. See ``DeriveLF`` and ``DeriveCRLF`` of /// ``LineEnding``. @@ -4933,7 +4971,7 @@ struct FormatStyle { /// Specifies when to insert a space in empty braces. /// \note /// This option doesn't apply to initializer braces if - /// ``Cpp11BracedListStyle`` is set to ``true``. + /// ``Cpp11BracedListStyle`` is not ``Block``. /// \endnote /// \version 22 SpaceInEmptyBracesStyle SpaceInEmptyBraces; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h index c233ca1..4aee165 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h @@ -211,6 +211,16 @@ protected: getExtraInvalidatedValues(ValueList &Values, RegionAndSymbolInvalidationTraits *ETraits) const {} + /// A state for looking up relevant Environment entries (arguments, return + /// value), dynamic type information and similar "stable" things. + /// WARNING: During the evaluation of a function call, several state + /// transitions happen, so this state can become partially obsolete! + /// + /// TODO: Instead of storing a complete state object in the CallEvent, only + /// store the relevant parts (such as argument/return SVals etc.) that aren't + /// allowed to become obsolete until the end of the call evaluation. + ProgramStateRef getState() const { return State; } + public: CallEvent &operator=(const CallEvent &) = delete; virtual ~CallEvent() = default; @@ -231,8 +241,11 @@ public: } void setForeign(bool B) const { Foreign = B; } - /// The state in which the call is being evaluated. - const ProgramStateRef &getState() const { return State; } + /// NOTE: There are plans for refactoring that would eliminate this method. + /// Prefer to use CheckerContext::getASTContext if possible! + const ASTContext &getASTContext() const { + return getState()->getStateManager().getContext(); + } /// The context in which the call is being evaluated. const LocationContext *getLocationContext() const { return LCtx; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 5838cf8..0cb4910 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3621,6 +3621,15 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS); }); + case clang::X86::BI__builtin_ia32_pmulhrsw128: + case clang::X86::BI__builtin_ia32_pmulhrsw256: + case clang::X86::BI__builtin_ia32_pmulhrsw512: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { + return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1) + .extractBits(16, 1); + }); + case clang::X86::BI__builtin_ia32_pavgb128: case clang::X86::BI__builtin_ia32_pavgw128: case clang::X86::BI__builtin_ia32_pavgb256: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 16141b2..e308c17 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11819,6 +11819,14 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case clang::X86::BI__builtin_ia32_pavgw512: return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU); + case clang::X86::BI__builtin_ia32_pmulhrsw128: + case clang::X86::BI__builtin_ia32_pmulhrsw256: + case clang::X86::BI__builtin_ia32_pmulhrsw512: + return EvaluateBinOpExpr([](const APSInt &LHS, const APSInt &RHS) { + return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1) + .extractBits(16, 1); + }); + case clang::X86::BI__builtin_ia32_pmaddubsw128: case clang::X86::BI__builtin_ia32_pmaddubsw256: case clang::X86::BI__builtin_ia32_pmaddubsw512: diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 29db200..994a427 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -306,8 +306,10 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators( // In Verilog, all strings are quoted by double quotes, joined by commas, // and wrapped in braces. The comma is always before the newline. assert(QuoteStyle == DoubleQuotes); - LeftBraceQuote = Style.Cpp11BracedListStyle ? "{\"" : "{ \""; - RightBraceQuote = Style.Cpp11BracedListStyle ? "\"}" : "\" }"; + LeftBraceQuote = + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? "{\"" : "{ \""; + RightBraceQuote = + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? "\"}" : "\" }"; Postfix = "\","; Prefix = "\""; } else { diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index b7d8569..26a9542 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -833,7 +833,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, auto IsOpeningBracket = [&](const FormatToken &Tok) { auto IsStartOfBracedList = [&]() { return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && - Style.Cpp11BracedListStyle; + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block; }; if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && !IsStartOfBracedList()) { @@ -925,7 +925,12 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, TT_TableGenDAGArgOpenerToBreak) && !(Current.MacroParent && Previous.MacroParent) && (Current.isNot(TT_LineComment) || - Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) && + (Previous.is(BK_BracedInit) && + (Style.Cpp11BracedListStyle != FormatStyle::BLS_FunctionCall || + !Previous.Previous || + Previous.Previous->isNoneOf(tok::identifier, tok::l_paren, + BK_BracedInit))) || + Previous.is(TT_VerilogMultiLineListLParen)) && !IsInTemplateString(Current)) { CurrentState.Indent = State.Column + Spaces; CurrentState.IsAligned = true; diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 093e88f..edd126c 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -304,6 +304,18 @@ struct ScalarEnumerationTraits<FormatStyle::BreakTemplateDeclarationsStyle> { } }; +template <> struct ScalarEnumerationTraits<FormatStyle::BracedListStyle> { + static void enumeration(IO &IO, FormatStyle::BracedListStyle &Value) { + IO.enumCase(Value, "Block", FormatStyle::BLS_Block); + IO.enumCase(Value, "FunctionCall", FormatStyle::BLS_FunctionCall); + IO.enumCase(Value, "AlignFirstComment", FormatStyle::BLS_AlignFirstComment); + + // For backward compatibility. + IO.enumCase(Value, "false", FormatStyle::BLS_Block); + IO.enumCase(Value, "true", FormatStyle::BLS_AlignFirstComment); + } +}; + template <> struct ScalarEnumerationTraits<FormatStyle::DAGArgStyle> { static void enumeration(IO &IO, FormatStyle::DAGArgStyle &Value) { IO.enumCase(Value, "DontBreak", FormatStyle::DAS_DontBreak); @@ -1628,7 +1640,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.CompactNamespaces = false; LLVMStyle.ConstructorInitializerIndentWidth = 4; LLVMStyle.ContinuationIndentWidth = 4; - LLVMStyle.Cpp11BracedListStyle = true; + LLVMStyle.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment; LLVMStyle.DerivePointerAlignment = false; LLVMStyle.DisableFormat = false; LLVMStyle.EmptyLineAfterAccessModifier = FormatStyle::ELAAMS_Never; @@ -1904,7 +1916,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { // beneficial there. Investigate turning this on once proper string reflow // has been implemented. GoogleStyle.BreakStringLiterals = false; - GoogleStyle.Cpp11BracedListStyle = false; + GoogleStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block; GoogleStyle.SpacesInContainerLiterals = false; } else if (Language == FormatStyle::LK_ObjC) { GoogleStyle.AlwaysBreakBeforeMultilineStrings = false; @@ -2000,7 +2012,7 @@ FormatStyle getMozillaStyle() { MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; MozillaStyle.ConstructorInitializerIndentWidth = 2; MozillaStyle.ContinuationIndentWidth = 2; - MozillaStyle.Cpp11BracedListStyle = false; + MozillaStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block; MozillaStyle.FixNamespaceComments = false; MozillaStyle.IndentCaseLabels = true; MozillaStyle.ObjCSpaceAfterProperty = true; @@ -2023,7 +2035,7 @@ FormatStyle getWebKitStyle() { Style.BreakBeforeBraces = FormatStyle::BS_WebKit; Style.BreakConstructorInitializers = FormatStyle::BCIS_BeforeComma; Style.ColumnLimit = 0; - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; Style.FixNamespaceComments = false; Style.IndentWidth = 4; Style.NamespaceIndentation = FormatStyle::NI_Inner; @@ -2043,7 +2055,7 @@ FormatStyle getGNUStyle() { Style.BreakBeforeBraces = FormatStyle::BS_GNU; Style.BreakBeforeTernaryOperators = true; Style.ColumnLimit = 79; - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; Style.FixNamespaceComments = false; Style.KeepFormFeed = true; Style.SpaceBeforeParens = FormatStyle::SBPO_Always; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index cf02280..d1c6264 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -67,7 +67,7 @@ bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const { assert(is(tok::r_brace)); assert(MatchingParen); assert(MatchingParen->is(tok::l_brace)); - if (!Style.Cpp11BracedListStyle || + if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) { return false; } @@ -88,7 +88,8 @@ bool FormatToken::opensBlockOrBlockTypeList(const FormatStyle &Style) const { return is(TT_ArrayInitializerLSquare) || is(TT_ProtoExtensionLSquare) || (is(tok::l_brace) && (getBlockKind() == BK_Block || is(TT_DictLiteral) || - (!Style.Cpp11BracedListStyle && NestingLevel == 0))) || + (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block && + NestingLevel == 0))) || (is(tok::less) && Style.isProto()); } @@ -184,7 +185,8 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { // In C++11 braced list style, we should not format in columns unless they // have many items (20 or more) or we allow bin-packing of function call // arguments. - if (Style.Cpp11BracedListStyle && !Style.BinPackArguments && + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block && + !Style.BinPackArguments && (Commas.size() < 19 || !Style.BinPackLongBracedList)) { return; } @@ -228,7 +230,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { ItemEnd = Token->MatchingParen; const FormatToken *NonCommentEnd = ItemEnd->getPreviousNonComment(); ItemLengths.push_back(CodePointsBetween(ItemBegin, NonCommentEnd)); - if (Style.Cpp11BracedListStyle && + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block && !ItemEnd->Previous->isTrailingComment()) { // In Cpp11 braced list style, the } and possibly other subsequent // tokens will need to stay on a line with the last element. diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index ffbd383..778d2ca 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4094,7 +4094,8 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (Current->is(TT_LineComment)) { if (Prev->is(BK_BracedInit) && Prev->opensScope()) { Current->SpacesRequiredBefore = - (Style.Cpp11BracedListStyle && !Style.SpacesInParensOptions.Other) + (Style.Cpp11BracedListStyle == FormatStyle::BLS_AlignFirstComment && + !Style.SpacesInParensOptions.Other) ? 0 : 1; } else if (Prev->is(TT_VerilogMultiLineListLParen)) { @@ -4445,8 +4446,10 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) { return 0; } - if (Left.is(tok::l_brace) && !Style.Cpp11BracedListStyle) + if (Left.is(tok::l_brace) && + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) { return 19; + } return Left.ParameterCount > 1 ? Style.PenaltyBreakBeforeFirstCallParameter : 19; } @@ -4612,7 +4615,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, // Format empty list as `<>`. if (Left.is(tok::less) && Right.is(tok::greater)) return false; - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } // Don't attempt to format operator<(), as it is handled later. if (Right.isNot(TT_OverloadedOperatorLParen)) @@ -4780,7 +4783,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const auto SpaceRequiredForArrayInitializerLSquare = [](const FormatToken &LSquareTok, const FormatStyle &Style) { return Style.SpacesInContainerLiterals || - (Style.isProto() && !Style.Cpp11BracedListStyle && + (Style.isProto() && + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block && LSquareTok.endsSequence(tok::l_square, tok::colon, TT_SelectorName)); }; @@ -4813,7 +4817,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) || (Right.is(tok::r_brace) && Right.MatchingParen && Right.MatchingParen->isNot(BK_Block))) { - return !Style.Cpp11BracedListStyle || Style.SpacesInParensOptions.Other; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || + Style.SpacesInParensOptions.Other; } if (Left.is(TT_BlockComment)) { // No whitespace in x(/*foo=*/1), except for JavaScript. @@ -4995,7 +5000,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, Left.Children.empty()) { if (Left.is(BK_Block)) return Style.SpaceInEmptyBraces != FormatStyle::SIEB_Never; - if (Style.Cpp11BracedListStyle) { + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block) { return Style.SpacesInParens == FormatStyle::SIPO_Custom && Style.SpacesInParensOptions.InEmptyParentheses; } @@ -5077,7 +5082,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.MatchingParen && Left.MatchingParen->is(TT_ProtoExtensionLSquare) && Right.isOneOf(tok::l_brace, tok::less)) { - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } // A percent is probably part of a formatting specification, such as %lld. if (Left.is(tok::percent)) @@ -5517,7 +5522,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.is(tok::greater) && Right.is(tok::greater)) { if (Style.isTextProto() || (Style.Language == FormatStyle::LK_Proto && Left.is(TT_DictLiteral))) { - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } return Right.is(TT_TemplateCloser) && Left.is(TT_TemplateCloser) && ((Style.Standard < FormatStyle::LS_Cpp11) || @@ -6378,7 +6383,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, return false; } if (Left.is(tok::equal) && Right.is(tok::l_brace) && - !Style.Cpp11BracedListStyle) { + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) { return false; } if (Left.is(TT_AttributeLParen) || diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 7348a3a..9261294 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -1238,7 +1238,8 @@ void WhitespaceManager::alignArrayInitializersRightJustified( if (!CellDescs.isRectangular()) return; - const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1; + const int BracePadding = + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1; auto &Cells = CellDescs.Cells; // Now go through and fixup the spaces. auto *CellIter = Cells.begin(); @@ -1314,7 +1315,8 @@ void WhitespaceManager::alignArrayInitializersLeftJustified( if (!CellDescs.isRectangular()) return; - const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1; + const int BracePadding = + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1; auto &Cells = CellDescs.Cells; // Now go through and fixup the spaces. auto *CellIter = Cells.begin(); diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index fa7f4c2..d35bc0e 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1650,9 +1650,8 @@ _mm256_mul_epi32(__m256i __a, __m256i __b) { /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the rounded products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mulhrs_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mulhrs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); } @@ -1670,8 +1669,7 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mulhi_epu16(__m256i __a, __m256i __b) -{ +_mm256_mulhi_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 23b2d29..ac75b6c 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1003,23 +1003,20 @@ _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhrs_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mulhrs_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mulhrs_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 639fb60..0fcfe37 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1510,28 +1510,28 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index ee96caa..5d0f20f 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -544,8 +544,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) { /// A 128-bit vector of [8 x i16] containing one of the source operands. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhrs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } @@ -563,11 +563,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a, /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_mulhrs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), - (__v8hi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhrs_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__zext128(__a), + (__v8hi)__zext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 652527a..ef1be23 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -12309,13 +12309,20 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, SourceLocation CC) { assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() && Source != Target); + + // Lone surrogates have a distinct representation in UTF-32. + // Converting between UTF-16 and UTF-32 codepoints seems very widespread, + // so don't warn on such conversion. + if (Source->isChar16Type() && Target->isChar32Type()) + return; + Expr::EvalResult Result; if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects, S.isConstantEvaluatedContext())) { llvm::APSInt Value(32); Value = Result.Val.getInt(); bool IsASCII = Value <= 0x7F; - bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF); + bool IsBMP = Value <= 0xDFFF || (Value >= 0xE000 && Value <= 0xFFFF); bool ConversionPreservesSemantics = IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP); diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index ca7e3b2..038f396 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -2864,9 +2864,9 @@ TemplateInstantiator::TransformNestedRequirement( TemplateArgs, Constraint->getSourceRange(), Satisfaction, /*TopLevelConceptId=*/nullptr, &NewConstraint); - assert(!Success || !Trap.hasErrorOccurred() && - "Substitution failures must be handled " - "by CheckConstraintSatisfaction."); + assert((!Success || !Trap.hasErrorOccurred()) && + "Substitution failures must be handled " + "by CheckConstraintSatisfaction."); } if (!Success || Satisfaction.HasSubstitutionFailure()) diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index bf35bee..3ddd659 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -104,7 +104,7 @@ class RAIIMutexDescriptor { // this function is called instead of early returning it. To avoid this, a // bool variable (IdentifierInfoInitialized) is used and the function will // be run only once. - const auto &ASTCtx = Call.getState()->getStateManager().getContext(); + const auto &ASTCtx = Call.getASTContext(); Guard = &ASTCtx.Idents.get(GuardName); } } diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp index 9d3aeff..2420848 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp @@ -929,7 +929,7 @@ ObjCDeallocChecker::getValueReleasedByNillingOut(const ObjCMethodCall &M, SVal Arg = M.getArgSVal(0); ProgramStateRef notNilState, nilState; std::tie(notNilState, nilState) = - M.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>()); + C.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>()); if (!(nilState && !notNilState)) return nullptr; diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp index f984caf..227cbfa 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp @@ -34,7 +34,7 @@ class ObjCSuperDeallocChecker this, "[super dealloc] should not be called more than once", categories::CoreFoundationObjectiveC}; - void initIdentifierInfoAndSelectors(ASTContext &Ctx) const; + void initIdentifierInfoAndSelectors(const ASTContext &Ctx) const; bool isSuperDeallocMessage(const ObjCMethodCall &M) const; @@ -214,8 +214,8 @@ void ObjCSuperDeallocChecker::diagnoseCallArguments(const CallEvent &CE, } } -void -ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors(ASTContext &Ctx) const { +void ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors( + const ASTContext &Ctx) const { if (IIdealloc) return; @@ -230,7 +230,7 @@ ObjCSuperDeallocChecker::isSuperDeallocMessage(const ObjCMethodCall &M) const { if (M.getOriginExpr()->getReceiverKind() != ObjCMessageExpr::SuperInstance) return false; - ASTContext &Ctx = M.getState()->getStateManager().getContext(); + const ASTContext &Ctx = M.getASTContext(); initIdentifierInfoAndSelectors(Ctx); return M.getSelector() == SELdealloc; diff --git a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp index 4fc1c57..db8bbee 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp @@ -211,13 +211,13 @@ private: if (!DefaultType) return; - ProgramStateRef State = ConstructorCall->getState(); + ProgramStateRef State = C.getState(); State = State->set<VariantHeldTypeMap>(ThisMemRegion, *DefaultType); C.addTransition(State); } bool handleStdGetCall(const CallEvent &Call, CheckerContext &C) const { - ProgramStateRef State = Call.getState(); + ProgramStateRef State = C.getState(); const auto &ArgType = Call.getArgSVal(0) .getType(C.getASTContext()) diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h index dec4612..b8fb572 100644 --- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h +++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h @@ -52,7 +52,7 @@ removeInformationStoredForDeadInstances(const CallEvent &Call, template <class TypeMap> void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C, SVal ThisSVal) { - ProgramStateRef State = Call.getState(); + ProgramStateRef State = C.getState(); if (!State) return; diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index b798618..a505d70 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1038,6 +1038,7 @@ __m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mulhrs_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_mulhrs_epi16((__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +18, -20, -21, -22, -22, +21, +20, -18, -16, +13, +9, -5)); __m256i test_mm256_mullo_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mullo_epi16 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index fddf17d..55bf482 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1596,18 +1596,24 @@ __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 return _mm512_mulhrs_epi16(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mulhrs_epi16((__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, -83, +82, +81, -79, -77, +74, +70, -66, -61, +56, +49, -43, -35, +27, +19, -10)); + __m512i test_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mulhrs_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_mulhrs_epi16(_mm512_set1_epi16(1), 0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1)); + __m512i test_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_mulhrs_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mulhrs_epi16(0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + __m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mulhi_epi16 // CHECK: @llvm.x86.avx512.pmulh.w.512 diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index d569283..95e4d40 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -2061,6 +2061,7 @@ __m128i test_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhrs_epi16(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_mulhrs_epi16(_mm_set1_epi16(1), 0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +1, +1, +1, +1)); __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_maskz_mulhrs_epi16 @@ -2068,6 +2069,7 @@ __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhrs_epi16(__U, __X, __Y); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_mulhrs_epi16(0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, 0, 0, 0, 0)); __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_mask_mulhrs_epi16 @@ -2075,6 +2077,7 @@ __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhrs_epi16(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_mulhrs_epi16(_mm256_set1_epi16(1), 0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +1, +1, +1, +1, +1, +1, +1, +1, -16, +13, +9, -5)); __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_maskz_mulhrs_epi16 @@ -2082,6 +2085,7 @@ __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhrs_epi16(__U, __X, __Y); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_mulhrs_epi16(0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, 0, 0, 0, 0, 0, 0, 0, 0, -16, +13, +9, -5)); __m128i test_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_mulhi_epu16 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index d9041d4..c1ac57b 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -438,6 +438,7 @@ __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128( return _mm_mulhrs_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_mulhrs_pi16((__m64)(__v4hi){+100, +200, -300, -400}, (__m64)(__v4hi){+30000, -20000, +10000, -5000}), +92, -122, -92, +61)); __m64 test_mm_mullo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mullo_pi16 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 32abd9d..f70afc0 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -110,6 +110,7 @@ __m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mulhrs_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_mulhrs_epi16((__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +61, -55, -43, -24)); __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_shuffle_epi8 diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp index fcff006..f17f20c 100644 --- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp +++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp @@ -14,7 +14,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}} c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}} - c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u16); c32(u32); @@ -30,7 +30,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c16(char32_t(0x7f)); c16(char32_t(0x80)); c16(char32_t(0xD7FF)); - c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}} + c16(char32_t(0xD800)); c16(char32_t(0xE000)); c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}} @@ -44,8 +44,8 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c32(char16_t(0x80)); c32(char16_t(0xD7FF)); - c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}} - c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}} + c32(char16_t(0xD800)); + c32(char16_t(0xDFFF)); c32(char16_t(0xE000)); c32(char16_t(u'☕')); diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 52f02c3..6488e38 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -176,7 +176,6 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(BreakBeforeTernaryOperators); CHECK_PARSE_BOOL(BreakStringLiterals); CHECK_PARSE_BOOL(CompactNamespaces); - CHECK_PARSE_BOOL(Cpp11BracedListStyle); CHECK_PARSE_BOOL(DerivePointerAlignment); CHECK_PARSE_BOOL_FIELD(DerivePointerAlignment, "DerivePointerBinding"); CHECK_PARSE_BOOL(DisableFormat); @@ -1139,6 +1138,18 @@ TEST(ConfigParseTest, ParsesConfiguration) { FormatStyle::SDS_Leave); CHECK_PARSE("SeparateDefinitionBlocks: Never", SeparateDefinitionBlocks, FormatStyle::SDS_Never); + + CHECK_PARSE("Cpp11BracedListStyle: Block", Cpp11BracedListStyle, + FormatStyle::BLS_Block); + CHECK_PARSE("Cpp11BracedListStyle: FunctionCall", Cpp11BracedListStyle, + FormatStyle::BLS_FunctionCall); + CHECK_PARSE("Cpp11BracedListStyle: AlignFirstComment", Cpp11BracedListStyle, + FormatStyle::BLS_AlignFirstComment); + // For backward compatibility: + CHECK_PARSE("Cpp11BracedListStyle: false", Cpp11BracedListStyle, + FormatStyle::BLS_Block); + CHECK_PARSE("Cpp11BracedListStyle: true", Cpp11BracedListStyle, + FormatStyle::BLS_AlignFirstComment); } TEST(ConfigParseTest, ParsesConfigurationWithLanguages) { diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index b9ad930..0fb8139 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -14363,7 +14363,7 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) { BreakBeforeLambdaBody); FormatStyle ExtraSpaces = getLLVMStyle(); - ExtraSpaces.Cpp11BracedListStyle = false; + ExtraSpaces.Cpp11BracedListStyle = FormatStyle::BLS_Block; ExtraSpaces.ColumnLimit = 75; verifyFormat("vector<int> x{ 1, 2, 3, 4 };", ExtraSpaces); verifyFormat("vector<T> x{ {}, {}, {}, {} };", ExtraSpaces); @@ -20346,7 +20346,7 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) { " return 0;\n" "}()};", BracedAlign); - BracedAlign.Cpp11BracedListStyle = false; + BracedAlign.Cpp11BracedListStyle = FormatStyle::BLS_Block; verifyFormat("const auto result{ []() {\n" " const auto something = 1;\n" " return 2;\n" @@ -21953,14 +21953,14 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresRightAlignment) { "});", Style); - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; verifyFormat("struct test demo[] = {\n" " { 56, 23, \"hello\" },\n" " { -1, 93463, \"world\" },\n" " { 7, 5, \"!!\" }\n" "};", Style); - Style.Cpp11BracedListStyle = true; + Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment; Style.ColumnLimit = 0; verifyFormat( @@ -22220,14 +22220,14 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) { " };", Style); - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; verifyFormat("struct test demo[] = {\n" " { 56, 23, \"hello\" },\n" " { -1, 93463, \"world\" },\n" " { 7, 5, \"!!\" }\n" "};", Style); - Style.Cpp11BracedListStyle = true; + Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment; Style.ColumnLimit = 0; verifyFormat( diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp index ea85ed6..d7fb15d 100644 --- a/clang/unittests/Format/FormatTestCSharp.cpp +++ b/clang/unittests/Format/FormatTestCSharp.cpp @@ -1194,7 +1194,7 @@ TEST_F(FormatTestCSharp, CSharpSpaces) { Style.SpaceBeforeSquareBrackets = false; Style.SpacesInSquareBrackets = false; Style.SpaceBeforeCpp11BracedList = true; - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; Style.SpacesInContainerLiterals = false; Style.SpaceAfterCStyleCast = false; diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 69026bc..fc80bf4 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -4699,6 +4699,58 @@ TEST_F(FormatTestComments, SplitCommentIntroducers) { getLLVMStyleWithColumns(10))); } +TEST_F(FormatTestComments, LineCommentsOnStartOfFunctionCall) { + auto Style = getLLVMStyle(); + + EXPECT_EQ(Style.Cpp11BracedListStyle, FormatStyle::BLS_AlignFirstComment); + verifyFormat("Type name{// Comment\n" + " value};", + Style); + + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; + verifyFormat("Type name{ // Comment\n" + " value\n" + "};", + Style); + + Style.Cpp11BracedListStyle = FormatStyle::BLS_FunctionCall; + verifyFormat("Type name{ // Comment\n" + " value};", + Style); + + verifyFormat("T foo( // Comment\n" + " arg);", + Style); + + verifyFormat("T bar{ // Comment\n" + " arg};", + Style); + + verifyFormat("T baz({ // Comment\n" + " arg});", + Style); + + verifyFormat("T baz{{ // Comment\n" + " arg}};", + Style); + + verifyFormat("T b0z(f( // Comment\n" + " arg));", + Style); + + verifyFormat("T b0z(F{ // Comment\n" + " arg});", + Style); + + verifyFormat("func( // Comment\n" + " arg);", + Style); + + verifyFormat("func({ // Comment\n" + " arg});", + Style); +} + } // end namespace } // namespace test } // end namespace format diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp index 1275564..1416614b 100644 --- a/clang/unittests/Format/FormatTestJava.cpp +++ b/clang/unittests/Format/FormatTestJava.cpp @@ -236,7 +236,7 @@ TEST_F(FormatTestJava, ArrayInitializers) { "};"); FormatStyle Style = getStyleWithColumns(65); - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; verifyFormat( "expected = new int[] { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,\n" " 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 };", diff --git a/clang/unittests/Format/FormatTestTextProto.cpp b/clang/unittests/Format/FormatTestTextProto.cpp index fd65c9a..6cddb838 100644 --- a/clang/unittests/Format/FormatTestTextProto.cpp +++ b/clang/unittests/Format/FormatTestTextProto.cpp @@ -514,7 +514,7 @@ TEST_F(FormatTestTextProto, FormatsRepeatedListInitializers) { "key: value"); auto Style = getDefaultStyle(); - Style.Cpp11BracedListStyle = true; + Style.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment; verifyFormat("keys: [1]", Style); } diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp index 5c50ae6..63e2cadf 100644 --- a/clang/unittests/Format/FormatTestVerilog.cpp +++ b/clang/unittests/Format/FormatTestVerilog.cpp @@ -1287,7 +1287,7 @@ TEST_F(FormatTestVerilog, StringLiteral) { getStyleWithColumns(getDefaultStyle(), 32)); // Space around braces should be correct. auto Style = getStyleWithColumns(getDefaultStyle(), 24); - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; verifyFormat(R"(x({ "xxxxxxxxxxxxxxxx ", "xxxx" });)", R"(x("xxxxxxxxxxxxxxxx xxxx");)", Style); diff --git a/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp b/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp index 6f02814..8d1b9ef1 100644 --- a/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/basic_exception_handling.cpp @@ -7,13 +7,14 @@ // This code is based on the repro in https://github.com/google/sanitizers/issues/749 #include <cstdio> #include <exception> +#include <stdexcept> -void throwInFunction() { throw std::exception("test2"); } +void throwInFunction() { throw std::runtime_error("test2"); } int main() { // case 1: direct throw try { - throw std::exception("test1"); + throw std::runtime_error("test1"); } catch (const std::exception &ex) { puts(ex.what()); // CHECK: test1 diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt index 2427da0..ed177ba 100644 --- a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt @@ -5,8 +5,10 @@ add_flang_library(FIROpenACCTransforms FIROpenACCPassesIncGen LINK_LIBS + FIRDialect + + MLIR_LIBS MLIRIR MLIRPass - FIRDialect MLIROpenACCDialect ) diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 642bf7d..2b1aa28 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -53,8 +53,7 @@ Implemented Papers - P2711R1: Making multi-param constructors of ``views`` ``explicit`` (`Github <https://github.com/llvm/llvm-project/issues/105252>`__) - P2770R0: Stashing stashing ``iterators`` for proper flattening (`Github <https://github.com/llvm/llvm-project/issues/105250>`__) - P2655R3: ``common_reference_t`` of ``reference_wrapper`` Should Be a Reference Type (`Github <https://github.com/llvm/llvm-project/issues/105260>`__) -- P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://github.com/llvm/llvm-project/issues/105424>`__) -- P3379R0: Constrain ``std::expected equality`` operators (`Github <https://github.com/llvm/llvm-project/issues/118135>`__) +- P3379R0: Constrain ``std::expected`` equality operators (`Github <https://github.com/llvm/llvm-project/issues/118135>`__) Improvements and New Features ----------------------------- diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 0fd7e53..1a7a2c7 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -44,6 +44,7 @@ Implemented Papers - P3223R2: Making ``std::istream::ignore`` less surprising (`Github <https://llvm.org/PR148178>`__) - P3060R3: Add ``std::views::indices(n)`` (`Github <https://llvm.org/PR148175>`__) - P2835R7: Expose ``std::atomic_ref``'s object address (`Github <https://llvm.org/PR118377>`__) +- P2944R3: Comparisons for ``reference_wrapper`` (`Github <https://llvm.org/PR105424>`__) - P3168R2: Give ``std::optional`` Range Support (`Github <https://llvm.org/PR105430>`__) Improvements and New Features diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index 7bf7bc9..237217a 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -149,5 +149,5 @@ "`LWG3343 <https://wg21.link/LWG3343>`__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16","`#105356 <https://github.com/llvm/llvm-project/issues/105356>`__","" "`LWG4139 <https://wg21.link/LWG4139>`__","§[time.zone.leap] recursive constraint in ``<=>``","Not Adopted Yet","|Complete|","20","`#118369 <https://github.com/llvm/llvm-project/issues/118369>`__","" "`LWG3456 <https://wg21.link/LWG3456>`__","Pattern used by ``std::from_chars`` is underspecified (option B)","Not Adopted Yet","|Complete|","20","`#118370 <https://github.com/llvm/llvm-project/issues/118370>`__","" -"`LWG3882 <https://wg21.link/LWG3882>`__","``tuple`` relational operators have confused friendships","Not Adopted Yet","|Complete|","21","The comparsion operators are constrained harder than the proposed resolution. libstdc++ and MSVC STL do the same.","" +"`LWG3882 <https://wg21.link/LWG3882>`__","``tuple`` relational operators have confused friendships","Not Adopted Yet","|Complete|","22","The comparsion operators are constrained harder than the proposed resolution. libstdc++ and MSVC STL do the same.","" "","","","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 9b83047..0eedc82 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -59,7 +59,7 @@ "`P2248R8 <https://wg21.link/P2248R8>`__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","","`#105421 <https://github.com/llvm/llvm-project/issues/105421>`__","" "`P2810R4 <https://wg21.link/P2810R4>`__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","","`#105422 <https://github.com/llvm/llvm-project/issues/105422>`__","" "`P1068R11 <https://wg21.link/P1068R11>`__","Vector API for random number generation","2024-03 (Tokyo)","","","`#105423 <https://github.com/llvm/llvm-project/issues/105423>`__","" -"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Complete|","21","`#105424 <https://github.com/llvm/llvm-project/issues/105424>`__","The changes to ``tuple``'s equality overload from P2165R4 are not yet implemented." +"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Complete|","22","`#105424 <https://github.com/llvm/llvm-project/issues/105424>`__","The changes to ``tuple``'s equality overload from P2165R4 are not yet implemented." "`P2642R6 <https://wg21.link/P2642R6>`__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","","`#105425 <https://github.com/llvm/llvm-project/issues/105425>`__","" "`P3029R1 <https://wg21.link/P3029R1>`__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19","`#105426 <https://github.com/llvm/llvm-project/issues/105426>`__","" "","","","","","" diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp index 00aa3cd..b5d8268 100644 --- a/llvm/benchmarks/SpecialCaseListBM.cpp +++ b/llvm/benchmarks/SpecialCaseListBM.cpp @@ -110,6 +110,26 @@ std::string genGlobAtBothSides(const std::vector<std::string> &Files) { return S; } +std::string genGlobAtBothSidesAndMid(const std::vector<std::string> &Files) { + std::string S; + std::minstd_rand Rng(RNG_SEED); + for (std::string F : Files) { + std::uniform_int_distribution<> PosDistrib(0, F.size() - 1); + F[PosDistrib(Rng)] = '*'; + + std::uniform_int_distribution<> Ends(0, 1); + if (Ends(Rng)) { + F.back() = '*'; + F.front() = '*'; + } + + S += "src:"; + S += F; + S += "\n"; + } + return S; +} + void BM_Make_( benchmark::State &state, std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) { @@ -171,6 +191,9 @@ BENCHMARK_CAPTURE(BM_Make_, Mid__, genGlobInMid) BENCHMARK_CAPTURE(BM_Make_, Both_, genGlobAtBothSides) ->RangeMultiplier(MAX_LIST_MUL) ->Range(MAX_LIST_MIN, MAX_LIST_MAX); +BENCHMARK_CAPTURE(BM_Make_, Mix__, genGlobAtBothSidesAndMid) + ->RangeMultiplier(MAX_LIST_MUL) + ->Range(MAX_LIST_MIN, MAX_LIST_MAX); BENCHMARK_CAPTURE(BM_True_, None_, genGlobNone) ->RangeMultiplier(MAX_LIST_MUL) @@ -187,6 +210,9 @@ BENCHMARK_CAPTURE(BM_True_, Mid__, genGlobInMid) BENCHMARK_CAPTURE(BM_True_, Both_, genGlobAtBothSides) ->RangeMultiplier(MAX_LIST_MUL) ->Range(MAX_LIST_MIN, MAX_LIST_MAX); +BENCHMARK_CAPTURE(BM_True_, Mix__, genGlobAtBothSidesAndMid) + ->RangeMultiplier(MAX_LIST_MUL) + ->Range(MAX_LIST_MIN, MAX_LIST_MAX); BENCHMARK_CAPTURE(BM_False, None_, genGlobNone) ->RangeMultiplier(MAX_LIST_MUL) @@ -203,5 +229,8 @@ BENCHMARK_CAPTURE(BM_False, Mid__, genGlobInMid) BENCHMARK_CAPTURE(BM_False, Both_, genGlobAtBothSides) ->RangeMultiplier(MAX_LIST_MUL) ->Range(MAX_LIST_MIN, MAX_LIST_MAX); +BENCHMARK_CAPTURE(BM_False, Mix__, genGlobAtBothSidesAndMid) + ->RangeMultiplier(MAX_LIST_MUL) + ->Range(MAX_LIST_MIN, MAX_LIST_MAX); BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h index 1af2761..1fbc41c 100644 --- a/llvm/include/llvm/ADT/Bitfields.h +++ b/llvm/include/llvm/ADT/Bitfields.h @@ -154,12 +154,9 @@ struct ResolveUnderlyingType { using type = std::underlying_type_t<T>; }; template <typename T> struct ResolveUnderlyingType<T, false> { - using type = T; -}; -template <> struct ResolveUnderlyingType<bool, false> { - /// In case sizeof(bool) != 1, replace `void` by an additionnal - /// std::conditional. - using type = std::conditional_t<sizeof(bool) == 1, uint8_t, void>; + static_assert(!std::is_same_v<T, bool> || sizeof(bool) == 1, + "T being bool requires sizeof(bool) == 1."); + using type = std::conditional_t<std::is_same_v<T, bool>, uint8_t, T>; }; } // namespace bitfields_details diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 4bda50f..25b5262 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -42,7 +42,7 @@ namespace detail { // We extend a pair to allow users to override the bucket type with their own // implementation without requiring two members. template <typename KeyT, typename ValueT> -struct DenseMapPair : public std::pair<KeyT, ValueT> { +struct DenseMapPair : std::pair<KeyT, ValueT> { using std::pair<KeyT, ValueT>::pair; KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; } diff --git a/llvm/include/llvm/ADT/DepthFirstIterator.h b/llvm/include/llvm/ADT/DepthFirstIterator.h index 4ced758..3c54f32 100644 --- a/llvm/include/llvm/ADT/DepthFirstIterator.h +++ b/llvm/include/llvm/ADT/DepthFirstIterator.h @@ -66,8 +66,8 @@ public: // one more method, completed, which is invoked when all children of a // node have been processed. It is intended to distinguish of back and // cross edges in the spanning tree but is not used in the common case. -template <typename NodeRef, unsigned SmallSize=8> -struct df_iterator_default_set : public SmallPtrSet<NodeRef, SmallSize> { +template <typename NodeRef, unsigned SmallSize = 8> +struct df_iterator_default_set : SmallPtrSet<NodeRef, SmallSize> { using BaseSet = SmallPtrSet<NodeRef, SmallSize>; using iterator = typename BaseSet::iterator; @@ -235,8 +235,10 @@ iterator_range<df_iterator<T>> depth_first(const T& G) { } // Provide global definitions of external depth first iterators... -template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>> -struct df_ext_iterator : public df_iterator<T, SetTy, true> { +template <class T, + class SetTy = + df_iterator_default_set<typename GraphTraits<T>::NodeRef>> +struct df_ext_iterator : df_iterator<T, SetTy, true> { df_ext_iterator(const df_iterator<T, SetTy, true> &V) : df_iterator<T, SetTy, true>(V) {} }; @@ -262,7 +264,7 @@ template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>, bool External = false> -struct idf_iterator : public df_iterator<Inverse<T>, SetTy, External> { +struct idf_iterator : df_iterator<Inverse<T>, SetTy, External> { idf_iterator(const df_iterator<Inverse<T>, SetTy, External> &V) : df_iterator<Inverse<T>, SetTy, External>(V) {} }; @@ -284,8 +286,10 @@ iterator_range<idf_iterator<T>> inverse_depth_first(const T& G) { } // Provide global definitions of external inverse depth first iterators... -template <class T, class SetTy = df_iterator_default_set<typename GraphTraits<T>::NodeRef>> -struct idf_ext_iterator : public idf_iterator<T, SetTy, true> { +template <class T, + class SetTy = + df_iterator_default_set<typename GraphTraits<T>::NodeRef>> +struct idf_ext_iterator : idf_iterator<T, SetTy, true> { idf_ext_iterator(const idf_iterator<T, SetTy, true> &V) : idf_iterator<T, SetTy, true>(V) {} idf_ext_iterator(const df_iterator<Inverse<T>, SetTy, true> &V) diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h index 310539f..8b2425e 100644 --- a/llvm/include/llvm/ADT/ImmutableSet.h +++ b/llvm/include/llvm/ADT/ImmutableSet.h @@ -931,8 +931,7 @@ struct ImutProfileInfo<T*> { /// ImutContainerInfo - Generic definition of comparison operations for /// elements of immutable containers that defaults to using /// std::equal_to<> and std::less<> to perform comparison of elements. -template <typename T> -struct ImutContainerInfo : public ImutProfileInfo<T> { +template <typename T> struct ImutContainerInfo : ImutProfileInfo<T> { using value_type = typename ImutProfileInfo<T>::value_type; using value_type_ref = typename ImutProfileInfo<T>::value_type_ref; using key_type = value_type; @@ -957,8 +956,7 @@ struct ImutContainerInfo : public ImutProfileInfo<T> { /// ImutContainerInfo - Specialization for pointer values to treat pointers /// as references to unique objects. Pointers are thus compared by /// their addresses. -template <typename T> -struct ImutContainerInfo<T*> : public ImutProfileInfo<T*> { +template <typename T> struct ImutContainerInfo<T *> : ImutProfileInfo<T *> { using value_type = typename ImutProfileInfo<T*>::value_type; using value_type_ref = typename ImutProfileInfo<T*>::value_type_ref; using key_type = value_type; diff --git a/llvm/include/llvm/ADT/PostOrderIterator.h b/llvm/include/llvm/ADT/PostOrderIterator.h index 1cbd3c1..d9aa452 100644 --- a/llvm/include/llvm/ADT/PostOrderIterator.h +++ b/llvm/include/llvm/ADT/PostOrderIterator.h @@ -200,7 +200,7 @@ template <class T> iterator_range<po_iterator<T>> post_order(const T &G) { // Provide global definitions of external postorder iterators... template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>> -struct po_ext_iterator : public po_iterator<T, SetType, true> { +struct po_ext_iterator : po_iterator<T, SetType, true> { po_ext_iterator(const po_iterator<T, SetType, true> &V) : po_iterator<T, SetType, true>(V) {} }; @@ -223,7 +223,7 @@ iterator_range<po_ext_iterator<T, SetType>> post_order_ext(const T &G, SetType & // Provide global definitions of inverse post order iterators... template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>, bool External = false> -struct ipo_iterator : public po_iterator<Inverse<T>, SetType, External> { +struct ipo_iterator : po_iterator<Inverse<T>, SetType, External> { ipo_iterator(const po_iterator<Inverse<T>, SetType, External> &V) : po_iterator<Inverse<T>, SetType, External> (V) {} }; @@ -245,7 +245,7 @@ iterator_range<ipo_iterator<T>> inverse_post_order(const T &G) { // Provide global definitions of external inverse postorder iterators... template <class T, class SetType = std::set<typename GraphTraits<T>::NodeRef>> -struct ipo_ext_iterator : public ipo_iterator<T, SetType, true> { +struct ipo_ext_iterator : ipo_iterator<T, SetType, true> { ipo_ext_iterator(const ipo_iterator<T, SetType, true> &V) : ipo_iterator<T, SetType, true>(V) {} ipo_ext_iterator(const po_iterator<Inverse<T>, SetType, true> &V) : diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 658f262..a9841c6 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -674,7 +674,7 @@ using zip_traits = iterator_facade_base< ReferenceTupleType *, ReferenceTupleType>; template <typename ZipType, typename ReferenceTupleType, typename... Iters> -struct zip_common : public zip_traits<ZipType, ReferenceTupleType, Iters...> { +struct zip_common : zip_traits<ZipType, ReferenceTupleType, Iters...> { using Base = zip_traits<ZipType, ReferenceTupleType, Iters...>; using IndexSequence = std::index_sequence_for<Iters...>; using value_type = typename Base::value_type; diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index f588a77..8e7c8b3 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -532,18 +532,8 @@ class SmallPtrSet : public SmallPtrSetImpl<PtrType> { using BaseT = SmallPtrSetImpl<PtrType>; - // A constexpr version of llvm::bit_ceil. - // TODO: Replace this with std::bit_ceil once C++20 is available. - static constexpr size_t RoundUpToPowerOfTwo(size_t X) { - size_t C = 1; - size_t CMax = C << (std::numeric_limits<size_t>::digits - 1); - while (C < X && C < CMax) - C <<= 1; - return C; - } - // Make sure that SmallSize is a power of two, round up if not. - static constexpr size_t SmallSizePowTwo = RoundUpToPowerOfTwo(SmallSize); + static constexpr size_t SmallSizePowTwo = llvm::bit_ceil_constexpr(SmallSize); /// SmallStorage - Fixed size storage used in 'small mode'. const void *SmallStorage[SmallSizePowTwo]; diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h index 8c68d0a..8b60b69 100644 --- a/llvm/include/llvm/ADT/bit.h +++ b/llvm/include/llvm/ADT/bit.h @@ -336,6 +336,21 @@ template <typename T> [[nodiscard]] T bit_ceil(T Value) { return T(1) << llvm::bit_width<T>(Value - 1u); } +/// Returns the smallest integral power of two no smaller than Value if Value is +/// nonzero. Returns 1 otherwise. +/// +/// Ex. bit_ceil(5) == 8. +/// +/// The return value is undefined if the input is larger than the largest power +/// of two representable in T. +template <typename T> [[nodiscard]] constexpr T bit_ceil_constexpr(T Value) { + static_assert(std::is_unsigned_v<T>, + "Only unsigned integral types are allowed."); + if (Value < 2) + return 1; + return T(1) << llvm::bit_width_constexpr<T>(Value - 1u); +} + template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> [[nodiscard]] constexpr T rotl(T V, int R) { constexpr unsigned N = std::numeric_limits<T>::digits; diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h index a4ca54e..f9d7c76 100644 --- a/llvm/include/llvm/Support/Alignment.h +++ b/llvm/include/llvm/Support/Alignment.h @@ -103,7 +103,7 @@ inline Align assumeAligned(uint64_t Value) { /// This struct is a compact representation of a valid (power of two) or /// undefined (0) alignment. -struct MaybeAlign : public std::optional<Align> { +struct MaybeAlign : std::optional<Align> { private: using UP = std::optional<Align>; diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 2a9a149..6f6df2e 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -340,7 +340,7 @@ struct ValueFromPointerCast /// during the cast. It's also a good example of how to implement a move-only /// cast. template <typename To, typename From, typename Derived = void> -struct UniquePtrCast : public CastIsPossible<To, From *> { +struct UniquePtrCast : CastIsPossible<To, From *> { using Self = detail::SelfType<Derived, UniquePtrCast<To, From>>; using CastResultType = std::unique_ptr< std::remove_reference_t<typename cast_retty<To, From>::ret_type>>; @@ -473,7 +473,7 @@ struct ForwardToPointerCast { // take advantage of the cast traits whenever possible! template <typename To, typename From, typename Enable = void> -struct CastInfo : public CastIsPossible<To, From> { +struct CastInfo : CastIsPossible<To, From> { using Self = CastInfo<To, From, Enable>; using CastReturnType = typename cast_retty<To, From>::ret_type; @@ -536,8 +536,7 @@ struct CastInfo<To, std::unique_ptr<From>> : public UniquePtrCast<To, From> {}; /// the input is std::optional<From> that the output can be std::optional<To>. /// If that's not the case, specialize CastInfo for your use case. template <typename To, typename From> -struct CastInfo<To, std::optional<From>> : public OptionalValueCast<To, From> { -}; +struct CastInfo<To, std::optional<From>> : OptionalValueCast<To, From> {}; /// isa<X> - Return true if the parameter to the template is an instance of one /// of the template type arguments. Used like this: diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index dd05c53..5a5f00e 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -549,7 +549,7 @@ template <class DataType> struct OptionValue; // The default value safely does nothing. Option value printing is only // best-effort. template <class DataType, bool isClass> -struct OptionValueBase : public GenericOptionValue { +struct OptionValueBase : GenericOptionValue { // Temporary storage for argument passing. using WrapperType = OptionValue<DataType>; diff --git a/llvm/include/llvm/Support/DOTGraphTraits.h b/llvm/include/llvm/Support/DOTGraphTraits.h index bf30aa4..3b9fe00 100644 --- a/llvm/include/llvm/Support/DOTGraphTraits.h +++ b/llvm/include/llvm/Support/DOTGraphTraits.h @@ -162,8 +162,7 @@ public: /// graphs are converted to 'dot' graphs. When specializing, you may inherit /// from DefaultDOTGraphTraits if you don't need to override everything. /// -template <typename Ty> -struct DOTGraphTraits : public DefaultDOTGraphTraits { +template <typename Ty> struct DOTGraphTraits : DefaultDOTGraphTraits { using DefaultDOTGraphTraits::DefaultDOTGraphTraits; }; diff --git a/llvm/include/llvm/Support/ELFAttributes.h b/llvm/include/llvm/Support/ELFAttributes.h index 270246f..5771a84 100644 --- a/llvm/include/llvm/Support/ELFAttributes.h +++ b/llvm/include/llvm/Support/ELFAttributes.h @@ -48,8 +48,6 @@ struct SubsectionAndTagToTagName { StringRef SubsectionName; unsigned Tag; StringRef TagName; - SubsectionAndTagToTagName(StringRef SN, unsigned Tg, StringRef TN) - : SubsectionName(SN), Tag(Tg), TagName(TN) {} }; namespace ELFAttrs { diff --git a/llvm/include/llvm/Support/LSP/Protocol.h b/llvm/include/llvm/Support/LSP/Protocol.h index 93b82f1..e38203a 100644 --- a/llvm/include/llvm/Support/LSP/Protocol.h +++ b/llvm/include/llvm/Support/LSP/Protocol.h @@ -449,7 +449,7 @@ struct ReferenceContext { bool fromJSON(const llvm::json::Value &value, ReferenceContext &result, llvm::json::Path path); -struct ReferenceParams : public TextDocumentPositionParams { +struct ReferenceParams : TextDocumentPositionParams { ReferenceContext context; }; diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index ed29826..4ba3867 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -41,7 +41,7 @@ template <typename T> class ArrayRef; class MD5 { public: - struct MD5Result : public std::array<uint8_t, 16> { + struct MD5Result : std::array<uint8_t, 16> { LLVM_ABI SmallString<32> digest() const; uint64_t low() const { diff --git a/llvm/include/llvm/Support/Timer.h b/llvm/include/llvm/Support/Timer.h index 40709d4..a4ed712 100644 --- a/llvm/include/llvm/Support/Timer.h +++ b/llvm/include/llvm/Support/Timer.h @@ -167,7 +167,7 @@ public: /// you to declare a new timer, AND specify the region to time, all in one /// statement. All timers with the same name are merged. This is primarily /// used for debugging and for hunting performance problems. -struct NamedRegionTimer : public TimeRegion { +struct NamedRegionTimer : TimeRegion { LLVM_ABI explicit NamedRegionTimer(StringRef Name, StringRef Description, StringRef GroupName, StringRef GroupDescription, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..4b22c68 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() { Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -127,9 +128,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +159,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..09ef6ac 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,6 +45,9 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); + // 32-bit ABS is legal for AMDGPU except for R600 + setOperationAction(ISD::ABS, MVT::i32, Expand); + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 942e784..50447f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10626,59 +10626,6 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; - const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, - this]() -> bool { - if (CmpValue != 0) - return false; - - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) - return false; - - bool CanOptimize = false; - - // For S_OP that set SCC = DST!=0, do the transformation - // - // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) - if (setsSCCifResultIsNonZero(*Def)) - CanOptimize = true; - - // s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has - // the same value that will be calculated by s_cmp_lg_* - // - // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero - // imm), 0) - if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 || - Def->getOpcode() == AMDGPU::S_CSELECT_B64) { - bool Op1IsNonZeroImm = - Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0; - bool Op2IsZeroImm = - Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0; - if (Op1IsNonZeroImm && Op2IsZeroImm) - CanOptimize = true; - } - - if (!CanOptimize) - return false; - - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) - return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; - } - - if (MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); - CmpInstr.eraseFromParent(); - return true; - }; - const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, bool IsReversible, bool IsSigned) -> bool { @@ -10753,20 +10700,16 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - MachineInstr *KillsSCC = nullptr; - for (MachineInstr &MI : - make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) { - if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); + I != E; ++I) { + if (I->modifiesRegister(AMDGPU::SCC, &RI) || + I->killsRegister(AMDGPU::SCC, &RI)) return false; - if (MI.killsRegister(AMDGPU::SCC, &RI)) - KillsSCC = &MI; } MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); SccDef->setIsDead(false); - if (KillsSCC) - KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); CmpInstr.eraseFromParent(); if (!MRI->use_nodbg_empty(DefReg)) { @@ -10810,7 +10753,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 32, true, false); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10818,7 +10761,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 64, true, false); } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index ee99a74..df27ec1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -709,30 +709,6 @@ public: } } - static bool setsSCCifResultIsNonZero(const MachineInstr &MI) { - if (!MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) - return false; - // Compares have no result - if (MI.isCompare()) - return false; - switch (MI.getOpcode()) { - default: - return true; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: - case AMDGPU::S_ADDC_U32: - case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: - case AMDGPU::S_SUBB_U32: - case AMDGPU::S_MIN_I32: - case AMDGPU::S_MIN_U32: - case AMDGPU::S_MAX_I32: - case AMDGPU::S_MAX_U32: - case AMDGPU::S_ADDK_I32: - return false; - } - } - static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index b80c3c9..4947d03 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -760,6 +761,7 @@ private: void handleCallArguments(CallBase &CB); void handleExtractOfWithOverflow(ExtractValueInst &EVI, const WithOverflowInst *WO, unsigned Idx); + bool isInstFullyOverDefined(Instruction &Inst); private: friend class InstVisitor<SCCPInstVisitor>; @@ -1374,49 +1376,66 @@ bool SCCPInstVisitor::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // 7. If a conditional branch has a value that is overdefined, make all // successors executable. void SCCPInstVisitor::visitPHINode(PHINode &PN) { - // If this PN returns a struct, just mark the result overdefined. - // TODO: We could do a lot better than this if code actually uses this. - if (PN.getType()->isStructTy()) - return (void)markOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) - return; // Quick exit - // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return (void)markOverdefined(&PN); - unsigned NumActiveIncoming = 0; + if (isInstFullyOverDefined(PN)) + return; + SmallVector<unsigned> FeasibleIncomingIndices; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + FeasibleIncomingIndices.push_back(i); + } // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is a constant // range. If there are no executable operands, the PHI remains unknown. - ValueLatticeElement PhiState = getValueState(&PN); - for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) - continue; - - const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); - PhiState.mergeIn(IV); - NumActiveIncoming++; - if (PhiState.isOverdefined()) - break; + if (StructType *STy = dyn_cast<StructType>(PN.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + ValueLatticeElement PhiState = getStructValueState(&PN, i); + if (PhiState.isOverdefined()) + continue; + for (unsigned j : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = + getStructValueState(PN.getIncomingValue(j), i); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + ValueLatticeElement &PhiStateRef = getStructValueState(&PN, i); + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); + } + } else { + ValueLatticeElement PhiState = getValueState(&PN); + for (unsigned i : FeasibleIncomingIndices) { + const ValueLatticeElement &IV = getValueState(PN.getIncomingValue(i)); + PhiState.mergeIn(IV); + if (PhiState.isOverdefined()) + break; + } + // We allow up to 1 range extension per active incoming value and one + // additional extension. Note that we manually adjust the number of range + // extensions to match the number of active incoming values. This helps to + // limit multiple extensions caused by the same incoming value, if other + // incoming values are equal. + ValueLatticeElement &PhiStateRef = ValueState[&PN]; + mergeInValue(PhiStateRef, &PN, PhiState, + ValueLatticeElement::MergeOptions().setMaxWidenSteps( + FeasibleIncomingIndices.size() + 1)); + PhiStateRef.setNumRangeExtensions( + std::max((unsigned)FeasibleIncomingIndices.size(), + PhiStateRef.getNumRangeExtensions())); } - - // We allow up to 1 range extension per active incoming value and one - // additional extension. Note that we manually adjust the number of range - // extensions to match the number of active incoming values. This helps to - // limit multiple extensions caused by the same incoming value, if other - // incoming values are equal. - ValueLatticeElement &PhiStateRef = ValueState[&PN]; - mergeInValue(PhiStateRef, &PN, PhiState, - ValueLatticeElement::MergeOptions().setMaxWidenSteps( - NumActiveIncoming + 1)); - PhiStateRef.setNumRangeExtensions( - std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions())); } void SCCPInstVisitor::visitReturnInst(ReturnInst &I) { @@ -2127,6 +2146,21 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { } } +bool SCCPInstVisitor::isInstFullyOverDefined(Instruction &Inst) { + // For structure Type, we handle each member separately. + // A structure object won't be considered as overdefined when + // there is at least one member that is not overdefined. + if (StructType *STy = dyn_cast<StructType>(Inst.getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) { + if (!getStructValueState(&Inst, i).isOverdefined()) + return false; + } + return true; + } + + return getValueState(&Inst).isOverdefined(); +} + void SCCPInstVisitor::solve() { // Process the work lists until they are empty! while (!BBWorkList.empty() || !InstWorkList.empty()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0e0b042..84d2ea6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -407,6 +407,10 @@ public: VPBasicBlock *getParent() { return Parent; } const VPBasicBlock *getParent() const { return Parent; } + /// \return the VPRegionBlock which the recipe belongs to. + VPRegionBlock *getRegion(); + const VPRegionBlock *getRegion() const; + /// The method which generates the output IR instructions that correspond to /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(VPTransformState &State) = 0; @@ -4075,6 +4079,14 @@ public: } }; +inline VPRegionBlock *VPRecipeBase::getRegion() { + return getParent()->getParent(); +} + +inline const VPRegionBlock *VPRecipeBase::getRegion() const { + return getParent()->getParent(); +} + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index f413c63..7e074c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -377,7 +377,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, #ifndef NDEBUG auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); + VPRegionBlock *Region = R->getRegion(); if (Region && Region->isReplicator()) { assert(Region->getNumSuccessors() == 1 && Region->getNumPredecessors() == 1 && "Expected SESE region!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7a98c75..d1e67e6b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2352,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return false; auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue()); auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *CanIV = getParent()->getParent()->getCanonicalIV(); + auto *CanIV = getRegion()->getCanonicalIV(); return StartC && StartC->isZero() && StepC && StepC->isOne() && getScalarType() == CanIV->getScalarType(); } @@ -3076,7 +3076,7 @@ static void scalarizeInstruction(const Instruction *Instr, State.AC->registerAssumption(II); assert( - (RepRecipe->getParent()->getParent() || + (RepRecipe->getRegion() || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || all_of(RepRecipe->operands(), [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && @@ -3268,7 +3268,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, to_vector(operands()), VF); // If the recipe is not predicated (i.e. not in a replicate region), return // the scalar cost. Otherwise handle predicated cost. - if (!getParent()->getParent()->isReplicator()) + if (!getRegion()->isReplicator()) return ScalarCost; // Account for the phi nodes that we will create. @@ -3284,7 +3284,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::Store: { // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - const VPRegionBlock *ParentRegion = getParent()->getParent(); + const VPRegionBlock *ParentRegion = getRegion(); if (ParentRegion && ParentRegion->isReplicator()) break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cae9aee8..f5f616f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1858,8 +1858,8 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, return nullptr; VPRegionBlock *EnclosingLoopRegion = HoistCandidate->getParent()->getEnclosingLoopRegion(); - assert((!HoistCandidate->getParent()->getParent() || - HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) && + assert((!HoistCandidate->getRegion() || + HoistCandidate->getRegion() == EnclosingLoopRegion) && "CFG in VPlan should still be flat, without replicate regions"); // Hoist candidate was already visited, no need to hoist. if (!Visited.insert(HoistCandidate).second) @@ -2898,7 +2898,7 @@ void VPlanTransforms::replaceSymbolicStrides( // evolution. auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { auto *R = cast<VPRecipeBase>(&U); - return R->getParent()->getParent() || + return R->getRegion() || R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); }; ValueToSCEVMapTy RewriteMap; @@ -3803,8 +3803,7 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) { continue; auto *DefR = cast<VPRecipeWithIRFlags>(&R); auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) { - VPRegionBlock *ParentRegion = - cast<VPRecipeBase>(U)->getParent()->getParent(); + VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion(); return !U->usesScalars(DefR) || ParentRegion != LoopRegion; }; if ((isa<VPReplicateRecipe>(DefR) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index cf95ac0..9a2497e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -64,7 +64,7 @@ inline bool isSingleScalar(const VPValue *VPV) { return true; if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) { - const VPRegionBlock *RegionOfR = Rep->getParent()->getParent(); + const VPRegionBlock *RegionOfR = Rep->getRegion(); // Don't consider recipes in replicate regions as uniform yet; their first // lane cannot be accessed when executing the replicate region for other // lanes. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 7714c03..5171403 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -140,6 +140,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -344,6 +345,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b81669..7b01f13 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -143,6 +143,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -347,6 +348,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 diff --git a/llvm/test/CodeGen/AMDGPU/abs_i32.ll b/llvm/test/CodeGen/AMDGPU/abs_i32.ll new file mode 100644 index 0000000..b53047f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/abs_i32.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600 %s + +define amdgpu_kernel void @abs_v1(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v1: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: SUB_INT * T1.W, 0.0, PV.W, +; R600-NEXT: MAX_INT T0.X, T0.W, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v2(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v2: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, KC0[2].Z, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %arg, %neg + %res = select i1 %cond, i32 %arg, i32 %neg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v3(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v3: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, PV.W, KC0[2].Z, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %neg, %arg + %res = select i1 %cond, i32 %neg, i32 %arg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll index 8088c1b..b72eba8 100644 --- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll +++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll @@ -180,7 +180,11 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B ; CHECK-LABEL: s_add64_32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s2, s4, 0 ; CHECK-NEXT: ; return to shader part epilog %sum64 = add i64 %val64A, %val64B @@ -195,10 +199,14 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s6, s2, s6 -; CHECK-NEXT: s_addc_u32 s7, s3, s7 +; CHECK-NEXT: s_add_u32 s10, s2, s6 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_addc_u32 s8, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -207,8 +215,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s6 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s10 +; CHECK-NEXT: v_mov_b32_e32 v5, s8 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -225,10 +233,14 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_v2i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_sub_u32 s6, s2, s6 -; CHECK-NEXT: s_subb_u32 s7, s3, s7 +; CHECK-NEXT: s_sub_u32 s10, s2, s6 +; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0 +; CHECK-NEXT: s_subb_u32 s8, s3, s7 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_sub_u32 s0, s0, s4 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -237,8 +249,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v7 ; CHECK-NEXT: v_readfirstlane_b32 s2, v6 -; CHECK-NEXT: v_mov_b32_e32 v4, s6 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s10 +; CHECK-NEXT: v_mov_b32_e32 v5, s8 ; CHECK-NEXT: s_mov_b32 s1, s0 ; CHECK-NEXT: s_mov_b32 s3, s2 ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -256,6 +268,8 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) ; CHECK-LABEL: s_uadd_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -278,6 +292,8 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_uadd_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -323,6 +339,8 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_p1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -345,6 +363,8 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) { ; CHECK-LABEL: s_usub_n1: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, -1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s1, s1, -1 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 51df8c3..948811e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7821,9 +7821,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s0, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -7854,6 +7855,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s15, s16, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s12 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s12 @@ -7879,50 +7881,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 ; GFX6-NEXT: s_addc_u32 s4, s4, 0 ; GFX6-NEXT: s_mul_i32 s14, s7, s14 -; GFX6-NEXT: s_add_u32 s16, s1, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX6-NEXT: s_addc_u32 s17, 0, s4 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_mul_i32 s4, s10, s17 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 -; GFX6-NEXT: s_mul_i32 s5, s11, s16 -; GFX6-NEXT: s_add_i32 s18, s4, s5 -; GFX6-NEXT: s_sub_i32 s14, s7, s18 -; GFX6-NEXT: s_mul_i32 s4, s10, s16 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s4, s5 -; GFX6-NEXT: s_subb_u32 s19, s14, s11 -; GFX6-NEXT: s_sub_u32 s20, s6, s10 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s19, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s20, s10 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s11 -; GFX6-NEXT: s_cselect_b32 s14, s19, s15 -; GFX6-NEXT: s_add_u32 s15, s16, 1 -; GFX6-NEXT: s_addc_u32 s19, s17, 0 -; GFX6-NEXT: s_add_u32 s20, s16, 2 -; GFX6-NEXT: s_addc_u32 s21, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s20, s15 -; GFX6-NEXT: s_cselect_b32 s15, s21, s19 +; GFX6-NEXT: s_or_b32 s18, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s19, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s18 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s17, 0 ; GFX6-NEXT: s_cmp_ge_u32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s6, s10 -; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s10 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, s11 -; GFX6-NEXT: s_cselect_b32 s4, s6, s5 +; GFX6-NEXT: s_cselect_b32 s4, s17, s5 +; GFX6-NEXT: s_add_u32 s5, s14, 1 +; GFX6-NEXT: s_addc_u32 s17, s15, 0 +; GFX6-NEXT: s_add_u32 s19, s14, 2 +; GFX6-NEXT: s_addc_u32 s20, s15, 0 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s5, s15, s17 -; GFX6-NEXT: s_cselect_b32 s4, s14, s16 +; GFX6-NEXT: s_cselect_b32 s4, s19, s5 +; GFX6-NEXT: s_cselect_b32 s5, s20, s17 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s7, s7, s16 +; GFX6-NEXT: s_cmp_ge_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s16, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s10 +; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s6, s6, s16 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cselect_b32 s5, s5, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s14 ; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 @@ -7945,8 +7949,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s8 -; GFX9-NEXT: s_subb_u32 s5, 0, s9 +; GFX9-NEXT: s_sub_u32 s10, 0, s8 +; GFX9-NEXT: s_subb_u32 s11, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7956,52 +7960,56 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v1 -; GFX9-NEXT: s_mul_i32 s12, s4, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11 -; GFX9-NEXT: s_mul_i32 s13, s5, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s4, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s5, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s13, s11, s4 +; GFX9-NEXT: s_add_i32 s5, s14, s5 +; GFX9-NEXT: s_mul_i32 s15, s10, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 +; GFX9-NEXT: s_mul_i32 s16, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 ; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s5, s12, s5 +; GFX9-NEXT: s_add_u32 s5, s13, s5 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_i32 s12, s4, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s5, s5, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_mul_i32 s14, s10, s4 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 -; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s4, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s10, s10, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12 -; GFX9-NEXT: s_addc_u32 s4, s15, s13 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s11, s11, s4 -; GFX9-NEXT: s_addc_u32 s10, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s10, s4 +; GFX9-NEXT: s_addc_u32 s10, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s12, s10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -8020,35 +8028,38 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_addc_u32 s11, s12, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 ; GFX9-NEXT: s_mul_i32 s10, s3, s10 -; GFX9-NEXT: s_add_u32 s13, s11, s10 -; GFX9-NEXT: s_addc_u32 s12, 0, s12 -; GFX9-NEXT: s_mul_i32 s10, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13 +; GFX9-NEXT: s_add_u32 s14, s11, s10 +; GFX9-NEXT: s_addc_u32 s15, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s9, s13 -; GFX9-NEXT: s_add_i32 s14, s10, s11 -; GFX9-NEXT: s_sub_i32 s15, s3, s14 -; GFX9-NEXT: s_mul_i32 s10, s8, s13 +; GFX9-NEXT: s_mul_i32 s11, s9, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s3, s16 +; GFX9-NEXT: s_mul_i32 s10, s8, s14 ; GFX9-NEXT: s_sub_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_subb_u32 s15, s15, s9 -; GFX9-NEXT: s_sub_u32 s16, s2, s8 -; GFX9-NEXT: s_subb_u32 s15, s15, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s9 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s17, s12, s9 +; GFX9-NEXT: s_sub_u32 s18, s2, s8 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s18, s8 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s8 -; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s15, s9 -; GFX9-NEXT: s_cselect_b32 s15, s16, s17 -; GFX9-NEXT: s_add_u32 s16, s13, 1 -; GFX9-NEXT: s_addc_u32 s17, s12, 0 -; GFX9-NEXT: s_add_u32 s18, s13, 2 -; GFX9-NEXT: s_addc_u32 s19, s12, 0 -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b32 s15, s18, s16 -; GFX9-NEXT: s_cselect_b32 s16, s19, s17 +; GFX9-NEXT: s_cmp_eq_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s13, s14, 1 +; GFX9-NEXT: s_addc_u32 s17, s15, 0 +; GFX9-NEXT: s_add_u32 s18, s14, 2 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s18, s13 +; GFX9-NEXT: s_cselect_b32 s13, s19, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_subb_u32 s3, s3, s16 ; GFX9-NEXT: s_cmp_ge_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s2, s8 @@ -8056,8 +8067,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s3, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s16, s12 -; GFX9-NEXT: s_cselect_b32 s2, s15, s13 +; GFX9-NEXT: s_cselect_b32 s3, s13, s15 +; GFX9-NEXT: s_cselect_b32 s2, s12, s14 ; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_sub_u32 s2, s2, s4 @@ -8317,9 +8328,10 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s17, 0, s18 ; GFX6-NEXT: s_add_u32 s18, s12, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s16, s16, s17 ; GFX6-NEXT: s_mul_i32 s12, s14, s16 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 @@ -8350,6 +8362,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s15, s18, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_addc_u32 s14, s16, s14 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 @@ -8374,53 +8387,55 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_addc_u32 s16, s16, 0 ; GFX6-NEXT: s_mul_i32 s14, s9, s14 -; GFX6-NEXT: s_add_u32 s18, s15, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s16 -; GFX6-NEXT: s_mul_i32 s14, s6, s19 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 ; GFX6-NEXT: v_readfirstlane_b32 s15, v0 ; GFX6-NEXT: s_add_i32 s14, s15, s14 -; GFX6-NEXT: s_mul_i32 s15, s7, s18 -; GFX6-NEXT: s_add_i32 s20, s14, s15 -; GFX6-NEXT: s_sub_i32 s16, s9, s20 -; GFX6-NEXT: s_mul_i32 s14, s6, s18 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 ; GFX6-NEXT: s_sub_u32 s8, s8, s14 ; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s14, s15 -; GFX6-NEXT: s_subb_u32 s21, s16, s7 -; GFX6-NEXT: s_sub_u32 s22, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GFX6-NEXT: s_or_b32 s16, s16, s17 -; GFX6-NEXT: s_subb_u32 s16, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s17, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s6 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, s7 -; GFX6-NEXT: s_cselect_b32 s16, s21, s17 -; GFX6-NEXT: s_add_u32 s17, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s16, s22, s17 -; GFX6-NEXT: s_cselect_b32 s17, s23, s21 +; GFX6-NEXT: s_or_b32 s20, s14, s15 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s21, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s9, s9, s20 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s17, 1 +; GFX6-NEXT: s_addc_u32 s19, s16, 0 +; GFX6-NEXT: s_add_u32 s21, s17, 2 +; GFX6-NEXT: s_addc_u32 s22, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_cselect_b32 s15, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s6, s6, s18 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s17, s19 -; GFX6-NEXT: s_cselect_b32 s6, s16, s18 +; GFX6-NEXT: s_cselect_b32 s7, s15, s16 +; GFX6-NEXT: s_cselect_b32 s6, s14, s17 ; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX6-NEXT: s_sub_u32 s16, s6, s2 -; GFX6-NEXT: s_subb_u32 s17, s7, s3 +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 ; GFX6-NEXT: s_ashr_i32 s6, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s6 ; GFX6-NEXT: s_mov_b32 s7, s6 @@ -8439,39 +8454,40 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s12, s14 +; GFX6-NEXT: s_mul_i32 s1, s12, s16 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s13, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s15, s12, s2 +; GFX6-NEXT: s_mul_i32 s17, s12, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s18, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: s_mul_i32 s17, s16, s17 ; GFX6-NEXT: v_readfirstlane_b32 s18, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_add_u32 s4, s4, s17 ; GFX6-NEXT: s_addc_u32 s4, s5, s18 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_mul_i32 s3, s16, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s16, s4 ; GFX6-NEXT: s_mul_i32 s2, s12, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -8485,14 +8501,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s13, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v2 -; GFX6-NEXT: s_add_u32 s13, s15, s13 -; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: v_readfirstlane_b32 s12, v3 ; GFX6-NEXT: s_add_u32 s3, s13, s3 -; GFX6-NEXT: s_addc_u32 s3, s14, s12 +; GFX6-NEXT: s_addc_u32 s3, s16, s12 ; GFX6-NEXT: v_readfirstlane_b32 s12, v1 ; GFX6-NEXT: s_addc_u32 s12, s12, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 @@ -8501,6 +8517,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s12, s4, s12 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 @@ -8512,70 +8529,72 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX6-NEXT: s_mul_i32 s2, s10, s12 -; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 -; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX6-NEXT: s_add_u32 s2, s15, s2 -; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 ; GFX6-NEXT: s_mul_i32 s13, s11, s13 -; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s13 -; GFX6-NEXT: s_addc_u32 s2, s14, s15 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 ; GFX6-NEXT: s_mul_i32 s12, s11, s12 -; GFX6-NEXT: s_add_u32 s18, s2, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_add_u32 s16, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: s_addc_u32 s19, 0, s13 -; GFX6-NEXT: s_mul_i32 s12, s8, s19 +; GFX6-NEXT: s_addc_u32 s17, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_readfirstlane_b32 s13, v0 ; GFX6-NEXT: s_add_i32 s12, s13, s12 -; GFX6-NEXT: s_mul_i32 s13, s9, s18 -; GFX6-NEXT: s_add_i32 s20, s12, s13 -; GFX6-NEXT: s_sub_i32 s14, s11, s20 -; GFX6-NEXT: s_mul_i32 s12, s8, s18 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 ; GFX6-NEXT: s_sub_u32 s10, s10, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s21, s14, s9 -; GFX6-NEXT: s_sub_u32 s22, s10, s8 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s21, 0 -; GFX6-NEXT: s_cmp_ge_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s15, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s22, s8 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s14, s9 -; GFX6-NEXT: s_cselect_b32 s14, s21, s15 -; GFX6-NEXT: s_add_u32 s15, s18, 1 -; GFX6-NEXT: s_addc_u32 s21, s19, 0 -; GFX6-NEXT: s_add_u32 s22, s18, 2 -; GFX6-NEXT: s_addc_u32 s23, s19, 0 -; GFX6-NEXT: s_cmp_lg_u32 s14, 0 -; GFX6-NEXT: s_cselect_b32 s14, s22, s15 -; GFX6-NEXT: s_cselect_b32 s15, s23, s21 +; GFX6-NEXT: s_or_b32 s20, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s21, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s11, s11, s20 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s12, s19, s13 +; GFX6-NEXT: s_add_u32 s13, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s21, s16, 2 +; GFX6-NEXT: s_addc_u32 s22, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b32 s12, s21, s13 +; GFX6-NEXT: s_cselect_b32 s13, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 ; GFX6-NEXT: s_cmp_ge_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s12, -1, 0 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s11, s9 -; GFX6-NEXT: s_cselect_b32 s8, s8, s12 +; GFX6-NEXT: s_cselect_b32 s8, s8, s18 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s9, s15, s19 -; GFX6-NEXT: s_cselect_b32 s8, s14, s18 +; GFX6-NEXT: s_cselect_b32 s9, s13, s17 +; GFX6-NEXT: s_cselect_b32 s8, s12, s16 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GFX6-NEXT: s_sub_u32 s4, s6, s4 ; GFX6-NEXT: s_subb_u32 s5, s7, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8595,8 +8614,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s12, 0, s6 -; GFX9-NEXT: s_subb_u32 s13, 0, s7 +; GFX9-NEXT: s_sub_u32 s14, 0, s6 +; GFX9-NEXT: s_subb_u32 s15, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8605,52 +8624,56 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s14, v1 -; GFX9-NEXT: v_readfirstlane_b32 s15, v0 -; GFX9-NEXT: s_mul_i32 s16, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15 -; GFX9-NEXT: s_mul_i32 s17, s13, s15 -; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_mul_i32 s19, s12, s15 -; GFX9-NEXT: s_add_i32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19 -; GFX9-NEXT: s_mul_i32 s20, s15, s16 -; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 +; GFX9-NEXT: v_readfirstlane_b32 s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s13, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s17, s15, s12 +; GFX9-NEXT: s_add_i32 s13, s18, s13 +; GFX9-NEXT: s_mul_i32 s19, s14, s12 +; GFX9-NEXT: s_add_i32 s13, s13, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 +; GFX9-NEXT: s_mul_i32 s20, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 ; GFX9-NEXT: s_add_u32 s18, s18, s20 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 -; GFX9-NEXT: s_mul_i32 s19, s14, s19 +; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 +; GFX9-NEXT: s_mul_i32 s19, s16, s19 ; GFX9-NEXT: s_add_u32 s18, s18, s19 -; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 ; GFX9-NEXT: s_addc_u32 s17, s17, s20 ; GFX9-NEXT: s_addc_u32 s18, s21, 0 -; GFX9-NEXT: s_mul_i32 s16, s14, s16 -; GFX9-NEXT: s_add_u32 s16, s17, s16 +; GFX9-NEXT: s_mul_i32 s13, s16, s13 +; GFX9-NEXT: s_add_u32 s13, s17, s13 ; GFX9-NEXT: s_addc_u32 s17, 0, s18 -; GFX9-NEXT: s_add_u32 s15, s15, s16 -; GFX9-NEXT: s_addc_u32 s14, s14, s17 -; GFX9-NEXT: s_mul_i32 s16, s12, s14 -; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 -; GFX9-NEXT: s_add_i32 s16, s17, s16 -; GFX9-NEXT: s_mul_i32 s13, s13, s15 -; GFX9-NEXT: s_add_i32 s16, s16, s13 -; GFX9-NEXT: s_mul_i32 s12, s12, s15 -; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12 -; GFX9-NEXT: s_mul_i32 s18, s14, s12 -; GFX9-NEXT: s_mul_i32 s20, s15, s16 -; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12 -; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16 -; GFX9-NEXT: s_add_u32 s12, s12, s20 +; GFX9-NEXT: s_add_u32 s18, s12, s13 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s16, s16, s17 +; GFX9-NEXT: s_mul_i32 s12, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s15, s15, s18 +; GFX9-NEXT: s_add_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s14, s18 +; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s16, s14 +; GFX9-NEXT: s_mul_i32 s20, s18, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 +; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 +; GFX9-NEXT: s_add_u32 s14, s14, s20 ; GFX9-NEXT: s_addc_u32 s19, 0, s19 -; GFX9-NEXT: s_add_u32 s12, s12, s18 -; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16 -; GFX9-NEXT: s_addc_u32 s12, s19, s17 +; GFX9-NEXT: s_add_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 +; GFX9-NEXT: s_addc_u32 s14, s19, s15 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_i32 s16, s14, s16 -; GFX9-NEXT: s_add_u32 s12, s12, s16 -; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s15, s15, s12 -; GFX9-NEXT: s_addc_u32 s14, s14, s13 +; GFX9-NEXT: s_mul_i32 s12, s16, s12 +; GFX9-NEXT: s_add_u32 s12, s14, s12 +; GFX9-NEXT: s_addc_u32 s14, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s18, s12 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s14, s16, s14 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8668,35 +8691,38 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_addc_u32 s15, s16, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 ; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s17, s15, s14 -; GFX9-NEXT: s_addc_u32 s16, 0, s16 -; GFX9-NEXT: s_mul_i32 s14, s6, s16 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17 +; GFX9-NEXT: s_add_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s19, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s19 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 ; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s15, s7, s17 -; GFX9-NEXT: s_add_i32 s18, s14, s15 -; GFX9-NEXT: s_sub_i32 s19, s9, s18 -; GFX9-NEXT: s_mul_i32 s14, s6, s17 +; GFX9-NEXT: s_mul_i32 s15, s7, s18 +; GFX9-NEXT: s_add_i32 s20, s14, s15 +; GFX9-NEXT: s_sub_i32 s16, s9, s20 +; GFX9-NEXT: s_mul_i32 s14, s6, s18 ; GFX9-NEXT: s_sub_u32 s8, s8, s14 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX9-NEXT: s_subb_u32 s19, s19, s7 -; GFX9-NEXT: s_sub_u32 s20, s8, s6 -; GFX9-NEXT: s_subb_u32 s19, s19, 0 -; GFX9-NEXT: s_cmp_ge_u32 s19, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s21, s16, s7 +; GFX9-NEXT: s_sub_u32 s22, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GFX9-NEXT: s_subb_u32 s16, s21, 0 +; GFX9-NEXT: s_cmp_ge_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s22, s6 ; GFX9-NEXT: s_cselect_b32 s21, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s20, s6 -; GFX9-NEXT: s_cselect_b32 s20, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s19, s7 -; GFX9-NEXT: s_cselect_b32 s19, s20, s21 -; GFX9-NEXT: s_add_u32 s20, s17, 1 -; GFX9-NEXT: s_addc_u32 s21, s16, 0 -; GFX9-NEXT: s_add_u32 s22, s17, 2 -; GFX9-NEXT: s_addc_u32 s23, s16, 0 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b32 s19, s22, s20 -; GFX9-NEXT: s_cselect_b32 s20, s23, s21 +; GFX9-NEXT: s_cmp_eq_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s16, s21, s17 +; GFX9-NEXT: s_add_u32 s17, s18, 1 +; GFX9-NEXT: s_addc_u32 s21, s19, 0 +; GFX9-NEXT: s_add_u32 s22, s18, 2 +; GFX9-NEXT: s_addc_u32 s23, s19, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s16, s22, s17 +; GFX9-NEXT: s_cselect_b32 s17, s23, s21 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_subb_u32 s9, s9, s20 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8704,12 +8730,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s14 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s20, s16 -; GFX9-NEXT: s_cselect_b32 s6, s19, s17 +; GFX9-NEXT: s_cselect_b32 s7, s17, s19 +; GFX9-NEXT: s_cselect_b32 s6, s16, s18 ; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GFX9-NEXT: s_sub_u32 s12, s6, s2 -; GFX9-NEXT: s_subb_u32 s13, s7, s3 +; GFX9-NEXT: s_sub_u32 s14, s6, s2 +; GFX9-NEXT: s_subb_u32 s15, s7, s3 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_mov_b32 s3, s2 @@ -8718,8 +8744,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s6 -; GFX9-NEXT: s_subb_u32 s5, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8729,98 +8755,105 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_readfirstlane_b32 s15, v2 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8 -; GFX9-NEXT: s_mul_i32 s16, s4, s15 -; GFX9-NEXT: s_mul_i32 s9, s5, s8 -; GFX9-NEXT: s_add_i32 s14, s14, s16 -; GFX9-NEXT: s_add_i32 s14, s14, s9 -; GFX9-NEXT: s_mul_i32 s17, s4, s8 -; GFX9-NEXT: s_mul_i32 s16, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s8, s13 +; GFX9-NEXT: s_mul_i32 s5, s9, s4 +; GFX9-NEXT: s_add_i32 s12, s12, s16 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s17, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 ; GFX9-NEXT: s_add_u32 s16, s18, s16 -; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17 -; GFX9-NEXT: s_mul_i32 s17, s15, s17 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 +; GFX9-NEXT: s_mul_i32 s17, s13, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14 -; GFX9-NEXT: s_addc_u32 s9, s9, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 +; GFX9-NEXT: s_addc_u32 s5, s5, s19 ; GFX9-NEXT: s_addc_u32 s16, s18, 0 -; GFX9-NEXT: s_mul_i32 s14, s15, s14 -; GFX9-NEXT: s_add_u32 s9, s9, s14 -; GFX9-NEXT: s_addc_u32 s14, 0, s16 -; GFX9-NEXT: s_add_u32 s8, s8, s9 -; GFX9-NEXT: s_addc_u32 s9, s15, s14 -; GFX9-NEXT: s_mul_i32 s14, s4, s9 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s5, s5, s8 -; GFX9-NEXT: s_add_i32 s14, s14, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4 -; GFX9-NEXT: s_mul_i32 s16, s9, s4 -; GFX9-NEXT: s_mul_i32 s18, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14 -; GFX9-NEXT: s_add_u32 s4, s4, s18 +; GFX9-NEXT: s_mul_i32 s12, s13, s12 +; GFX9-NEXT: s_add_u32 s5, s5, s12 +; GFX9-NEXT: s_addc_u32 s12, 0, s16 +; GFX9-NEXT: s_add_u32 s16, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s4, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s16 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s16 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_mul_i32 s13, s12, s8 +; GFX9-NEXT: s_mul_i32 s18, s16, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s4, s4, s16 -; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14 -; GFX9-NEXT: s_addc_u32 s4, s17, s15 +; GFX9-NEXT: s_add_u32 s8, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s8, s17, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s14, s8, s4 -; GFX9-NEXT: s_addc_u32 s15, s9, s5 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s13, s16, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s8 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15 +; GFX9-NEXT: s_mul_i32 s11, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 ; GFX9-NEXT: s_add_u32 s11, s16, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14 -; GFX9-NEXT: s_mul_i32 s14, s9, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s14 -; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 +; GFX9-NEXT: s_mul_i32 s13, s9, s13 +; GFX9-NEXT: s_add_u32 s11, s11, s13 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 ; GFX9-NEXT: s_addc_u32 s10, s10, s17 ; GFX9-NEXT: s_addc_u32 s11, s16, 0 -; GFX9-NEXT: s_mul_i32 s14, s9, s15 -; GFX9-NEXT: s_add_u32 s14, s10, s14 -; GFX9-NEXT: s_addc_u32 s15, 0, s11 -; GFX9-NEXT: s_mul_i32 s10, s6, s15 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14 +; GFX9-NEXT: s_mul_i32 s12, s9, s12 +; GFX9-NEXT: s_add_u32 s16, s10, s12 +; GFX9-NEXT: s_addc_u32 s17, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s17 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 ; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s11, s7, s14 -; GFX9-NEXT: s_add_i32 s16, s10, s11 -; GFX9-NEXT: s_sub_i32 s17, s9, s16 -; GFX9-NEXT: s_mul_i32 s10, s6, s14 +; GFX9-NEXT: s_mul_i32 s11, s7, s16 +; GFX9-NEXT: s_add_i32 s18, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s9, s18 +; GFX9-NEXT: s_mul_i32 s10, s6, s16 ; GFX9-NEXT: s_sub_u32 s8, s8, s10 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX9-NEXT: s_subb_u32 s17, s17, s7 -; GFX9-NEXT: s_sub_u32 s18, s8, s6 -; GFX9-NEXT: s_subb_u32 s17, s17, 0 -; GFX9-NEXT: s_cmp_ge_u32 s17, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s19, s12, s7 +; GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s18, s6 -; GFX9-NEXT: s_cselect_b32 s18, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s17, s7 -; GFX9-NEXT: s_cselect_b32 s17, s18, s19 -; GFX9-NEXT: s_add_u32 s18, s14, 1 -; GFX9-NEXT: s_addc_u32 s19, s15, 0 -; GFX9-NEXT: s_add_u32 s20, s14, 2 -; GFX9-NEXT: s_addc_u32 s21, s15, 0 -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b32 s17, s20, s18 -; GFX9-NEXT: s_cselect_b32 s18, s21, s19 +; GFX9-NEXT: s_cmp_eq_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s12, s19, s13 +; GFX9-NEXT: s_add_u32 s13, s16, 1 +; GFX9-NEXT: s_addc_u32 s19, s17, 0 +; GFX9-NEXT: s_add_u32 s20, s16, 2 +; GFX9-NEXT: s_addc_u32 s21, s17, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s20, s13 +; GFX9-NEXT: s_cselect_b32 s13, s21, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s9, s9, s16 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s10, -1, 0 ; GFX9-NEXT: s_cmp_ge_u32 s8, s6 @@ -8828,14 +8861,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s9, s7 ; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s7, s18, s15 -; GFX9-NEXT: s_cselect_b32 s6, s17, s14 +; GFX9-NEXT: s_cselect_b32 s7, s13, s17 +; GFX9-NEXT: s_cselect_b32 s6, s12, s16 ; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] ; GFX9-NEXT: s_sub_u32 s2, s4, s2 ; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9056,9 +9089,10 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_addc_u32 s13, 0, s14 ; GFX6-NEXT: s_add_u32 s14, s0, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s13 ; GFX6-NEXT: s_mul_i32 s0, s10, s12 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 @@ -9089,6 +9123,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_add_u32 s13, s14, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_addc_u32 s12, s12, s10 ; GFX6-NEXT: s_ashr_i32 s10, s7, 31 ; GFX6-NEXT: s_add_u32 s0, s6, s10 @@ -9123,43 +9158,46 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_readfirstlane_b32 s5, v0 ; GFX6-NEXT: s_add_i32 s4, s5, s4 ; GFX6-NEXT: s_mul_i32 s5, s9, s12 -; GFX6-NEXT: s_add_i32 s14, s4, s5 -; GFX6-NEXT: s_sub_i32 s13, s7, s14 +; GFX6-NEXT: s_add_i32 s13, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s13 ; GFX6-NEXT: s_mul_i32 s4, s8, s12 ; GFX6-NEXT: s_sub_u32 s6, s6, s4 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s4, s5 -; GFX6-NEXT: s_subb_u32 s15, s13, s9 -; GFX6-NEXT: s_sub_u32 s16, s6, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s17, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s15, 0 -; GFX6-NEXT: s_cmp_ge_u32 s17, s9 -; GFX6-NEXT: s_cselect_b32 s18, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s16, s8 -; GFX6-NEXT: s_cselect_b32 s19, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s17, s9 -; GFX6-NEXT: s_cselect_b32 s18, s19, s18 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s15, s15, s9 -; GFX6-NEXT: s_sub_u32 s19, s16, s8 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s15, 0 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b32 s13, s19, s16 -; GFX6-NEXT: s_cselect_b32 s12, s12, s17 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s15, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX6-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NEXT: s_subb_u32 s4, s7, s14 -; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s16, s14, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s9 ; GFX6-NEXT: s_cselect_b32 s5, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s6, s8 +; GFX6-NEXT: s_cmp_ge_u32 s15, s8 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s9 +; GFX6-NEXT: s_cselect_b32 s17, s17, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s18, s15, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s14, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s14, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s16 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s5, s7, s13 +; GFX6-NEXT: s_cmp_ge_u32 s5, s9 ; GFX6-NEXT: s_cselect_b32 s7, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s5, s7, s5 -; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: s_cselect_b32 s5, s12, s4 -; GFX6-NEXT: s_cselect_b32 s4, s13, s6 +; GFX6-NEXT: s_cmp_ge_u32 s6, s8 +; GFX6-NEXT: s_cselect_b32 s8, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, s9 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 +; GFX6-NEXT: s_cmp_lg_u32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s5, s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s14, s6 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 @@ -9181,8 +9219,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s6 -; GFX9-NEXT: s_subb_u32 s5, 0, s7 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9192,52 +9230,56 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v1 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_mul_i32 s11, s5, s9 -; GFX9-NEXT: s_add_i32 s10, s12, s10 -; GFX9-NEXT: s_mul_i32 s13, s4, s9 -; GFX9-NEXT: s_add_i32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13 -; GFX9-NEXT: s_mul_i32 s14, s9, s10 -; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s5, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s11, s9, s4 +; GFX9-NEXT: s_add_i32 s5, s12, s5 +; GFX9-NEXT: s_mul_i32 s13, s8, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX9-NEXT: s_mul_i32 s14, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 ; GFX9-NEXT: s_add_u32 s12, s12, s14 ; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13 -; GFX9-NEXT: s_mul_i32 s13, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 +; GFX9-NEXT: s_mul_i32 s13, s10, s13 ; GFX9-NEXT: s_add_u32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 ; GFX9-NEXT: s_addc_u32 s11, s11, s15 ; GFX9-NEXT: s_addc_u32 s12, s14, 0 -; GFX9-NEXT: s_mul_i32 s10, s8, s10 -; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s5, s10, s5 +; GFX9-NEXT: s_add_u32 s5, s11, s5 ; GFX9-NEXT: s_addc_u32 s11, 0, s12 -; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_addc_u32 s8, s8, s11 -; GFX9-NEXT: s_mul_i32 s10, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9 -; GFX9-NEXT: s_add_i32 s10, s11, s10 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_i32 s10, s10, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s9 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4 -; GFX9-NEXT: s_mul_i32 s12, s8, s4 -; GFX9-NEXT: s_mul_i32 s14, s9, s10 -; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4 -; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10 -; GFX9-NEXT: s_add_u32 s4, s4, s14 +; GFX9-NEXT: s_add_u32 s12, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_i32 s4, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s12 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 +; GFX9-NEXT: s_mul_i32 s11, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s12, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10 -; GFX9-NEXT: s_addc_u32 s4, s13, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 +; GFX9-NEXT: s_addc_u32 s8, s13, s9 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s10, s8, s10 -; GFX9-NEXT: s_add_u32 s4, s4, s10 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s9, s9, s4 -; GFX9-NEXT: s_addc_u32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s4, s10, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s12, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s10, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 @@ -9267,9 +9309,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s8, s6, s8 ; GFX9-NEXT: s_sub_u32 s2, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s13, s10, s7 ; GFX9-NEXT: s_sub_u32 s14, s2, s6 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s15, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 @@ -9278,11 +9322,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_eq_u32 s15, s7 ; GFX9-NEXT: s_cselect_b32 s16, s17, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s13, s7 -; GFX9-NEXT: s_sub_u32 s11, s14, s6 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s17, s14, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s13, 0 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s11, s11, s14 +; GFX9-NEXT: s_cselect_b32 s11, s17, s14 ; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s3, s3, s12 @@ -9444,9 +9490,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_addc_u32 s15, 0, s16 ; GFX6-NEXT: s_add_u32 s16, s6, s7 ; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s14, s14, s15 ; GFX6-NEXT: s_mul_i32 s6, s12, s14 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 @@ -9477,6 +9524,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s13, s16, s6 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s12, s14, s12 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s6 @@ -9509,46 +9557,49 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_add_i32 s13, s14, s13 ; GFX6-NEXT: s_mul_i32 s14, s3, s12 -; GFX6-NEXT: s_add_i32 s16, s13, s14 -; GFX6-NEXT: s_sub_i32 s14, s9, s16 +; GFX6-NEXT: s_add_i32 s14, s13, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 ; GFX6-NEXT: s_mul_i32 s12, s2, s12 ; GFX6-NEXT: s_sub_u32 s8, s8, s12 ; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s15, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s14, s3 -; GFX6-NEXT: s_sub_u32 s18, s8, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s14, s15 -; GFX6-NEXT: s_subb_u32 s19, s17, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s3 -; GFX6-NEXT: s_cselect_b32 s20, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s2 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s19, s3 -; GFX6-NEXT: s_cselect_b32 s20, s21, s20 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s17, s17, s3 -; GFX6-NEXT: s_sub_u32 s21, s18, s2 -; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GFX6-NEXT: s_or_b32 s14, s14, s15 -; GFX6-NEXT: s_subb_u32 s14, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s15, s21, s18 -; GFX6-NEXT: s_cselect_b32 s14, s14, s19 +; GFX6-NEXT: s_or_b32 s16, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s17, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s2 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s19, s19, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s20, s17, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s9, s9, s16 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s13, s20, s17 +; GFX6-NEXT: s_cselect_b32 s12, s12, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 ; GFX6-NEXT: s_cmp_ge_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s12, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s2 ; GFX6-NEXT: s_cselect_b32 s2, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s3 -; GFX6-NEXT: s_cselect_b32 s2, s2, s12 +; GFX6-NEXT: s_cselect_b32 s2, s2, s14 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_cselect_b32 s3, s14, s9 -; GFX6-NEXT: s_cselect_b32 s2, s15, s8 +; GFX6-NEXT: s_cselect_b32 s3, s12, s9 +; GFX6-NEXT: s_cselect_b32 s2, s13, s8 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_u32 s14, s2, s6 -; GFX6-NEXT: s_subb_u32 s15, s3, s6 +; GFX6-NEXT: s_sub_u32 s12, s2, s6 +; GFX6-NEXT: s_subb_u32 s13, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 @@ -9567,39 +9618,40 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s1, s8, s12 +; GFX6-NEXT: s_mul_i32 s1, s8, s14 ; GFX6-NEXT: v_readfirstlane_b32 s3, v2 ; GFX6-NEXT: s_mul_i32 s0, s9, s2 ; GFX6-NEXT: s_add_i32 s1, s3, s1 ; GFX6-NEXT: s_add_i32 s3, s1, s0 -; GFX6-NEXT: s_mul_i32 s13, s8, s2 +; GFX6-NEXT: s_mul_i32 s15, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mul_i32 s4, s2, s3 ; GFX6-NEXT: v_readfirstlane_b32 s5, v2 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 ; GFX6-NEXT: s_add_u32 s4, s16, s4 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 -; GFX6-NEXT: s_mul_i32 s13, s12, s13 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 ; GFX6-NEXT: v_readfirstlane_b32 s16, v0 -; GFX6-NEXT: s_add_u32 s4, s4, s13 +; GFX6-NEXT: s_add_u32 s4, s4, s15 ; GFX6-NEXT: s_addc_u32 s4, s5, s16 ; GFX6-NEXT: v_readfirstlane_b32 s5, v1 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_mul_i32 s3, s12, s3 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 ; GFX6-NEXT: s_add_u32 s3, s4, s3 ; GFX6-NEXT: s_addc_u32 s4, 0, s5 ; GFX6-NEXT: s_add_u32 s5, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s4, s12, s4 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 ; GFX6-NEXT: s_mul_i32 s2, s8, s4 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_add_i32 s2, s3, s2 @@ -9613,98 +9665,102 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mul_i32 s9, s5, s2 -; GFX6-NEXT: v_readfirstlane_b32 s13, v2 -; GFX6-NEXT: s_add_u32 s9, s13, s9 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s9, s15, s9 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 ; GFX6-NEXT: s_mul_i32 s3, s4, s3 -; GFX6-NEXT: s_addc_u32 s12, 0, s12 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 ; GFX6-NEXT: v_readfirstlane_b32 s8, v3 ; GFX6-NEXT: s_add_u32 s3, s9, s3 -; GFX6-NEXT: s_addc_u32 s3, s12, s8 +; GFX6-NEXT: s_addc_u32 s3, s14, s8 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: s_addc_u32 s8, s8, 0 ; GFX6-NEXT: s_mul_i32 s2, s4, s2 ; GFX6-NEXT: s_add_u32 s2, s3, s2 ; GFX6-NEXT: s_addc_u32 s8, 0, s8 -; GFX6-NEXT: s_add_u32 s12, s5, s2 +; GFX6-NEXT: s_add_u32 s14, s5, s2 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_addc_u32 s13, s4, s8 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s15, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s11, 31 ; GFX6-NEXT: s_add_u32 s2, s10, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s11, s4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 -; GFX6-NEXT: s_mul_i32 s2, s8, s13 +; GFX6-NEXT: s_mul_i32 s2, s8, s15 ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: s_add_u32 s2, s11, s2 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 -; GFX6-NEXT: s_mul_i32 s11, s9, s12 -; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_mul_i32 s11, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 ; GFX6-NEXT: s_add_u32 s2, s2, s11 -; GFX6-NEXT: s_addc_u32 s2, s10, s12 +; GFX6-NEXT: s_addc_u32 s2, s10, s14 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 ; GFX6-NEXT: s_addc_u32 s10, s10, 0 -; GFX6-NEXT: s_mul_i32 s11, s9, s13 +; GFX6-NEXT: s_mul_i32 s11, s9, s15 ; GFX6-NEXT: s_add_u32 s11, s2, s11 ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: s_addc_u32 s10, 0, s10 ; GFX6-NEXT: s_mul_i32 s10, s6, s10 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_readfirstlane_b32 s12, v0 -; GFX6-NEXT: s_add_i32 s10, s12, s10 -; GFX6-NEXT: s_mul_i32 s12, s7, s11 -; GFX6-NEXT: s_add_i32 s16, s10, s12 -; GFX6-NEXT: s_sub_i32 s12, s9, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_add_i32 s10, s14, s10 +; GFX6-NEXT: s_mul_i32 s14, s7, s11 +; GFX6-NEXT: s_add_i32 s14, s10, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 ; GFX6-NEXT: s_mul_i32 s10, s6, s11 ; GFX6-NEXT: s_sub_u32 s8, s8, s10 ; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX6-NEXT: s_or_b32 s13, s10, s11 -; GFX6-NEXT: s_subb_u32 s17, s12, s7 -; GFX6-NEXT: s_sub_u32 s18, s8, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s19, s12, s13 -; GFX6-NEXT: s_subb_u32 s19, s17, 0 -; GFX6-NEXT: s_cmp_ge_u32 s19, s7 -; GFX6-NEXT: s_cselect_b32 s20, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s18, s6 -; GFX6-NEXT: s_cselect_b32 s21, -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s19, s7 -; GFX6-NEXT: s_cselect_b32 s20, s21, s20 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s17, s17, s7 -; GFX6-NEXT: s_sub_u32 s21, s18, s6 -; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX6-NEXT: s_or_b32 s12, s12, s13 -; GFX6-NEXT: s_subb_u32 s12, s17, 0 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b32 s13, s21, s18 -; GFX6-NEXT: s_cselect_b32 s12, s12, s19 +; GFX6-NEXT: s_or_b32 s16, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s17, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX6-NEXT: s_or_b32 s10, s10, s11 -; GFX6-NEXT: s_subb_u32 s9, s9, s16 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s11, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s19, s19, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s20, s17, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s10, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s10, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s11, s20, s17 +; GFX6-NEXT: s_cselect_b32 s10, s10, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 ; GFX6-NEXT: s_cmp_ge_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s10, -1, 0 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s9, s7 -; GFX6-NEXT: s_cselect_b32 s6, s6, s10 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_cselect_b32 s7, s12, s9 -; GFX6-NEXT: s_cselect_b32 s6, s13, s8 +; GFX6-NEXT: s_cselect_b32 s7, s10, s9 +; GFX6-NEXT: s_cselect_b32 s6, s11, s8 ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_sub_u32 s5, s6, s4 ; GFX6-NEXT: s_subb_u32 s4, s7, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9724,8 +9780,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s6, 0, s2 -; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: s_sub_u32 s12, 0, s2 +; GFX9-NEXT: s_subb_u32 s13, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9734,52 +9790,56 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: v_readfirstlane_b32 s13, v0 -; GFX9-NEXT: s_mul_i32 s14, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13 -; GFX9-NEXT: s_mul_i32 s15, s7, s13 -; GFX9-NEXT: s_add_i32 s14, s16, s14 -; GFX9-NEXT: s_mul_i32 s17, s6, s13 -; GFX9-NEXT: s_add_i32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17 -; GFX9-NEXT: s_mul_i32 s18, s13, s14 -; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s15, s13, s6 +; GFX9-NEXT: s_add_i32 s7, s16, s7 +; GFX9-NEXT: s_mul_i32 s17, s12, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 +; GFX9-NEXT: s_mul_i32 s18, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 ; GFX9-NEXT: s_add_u32 s16, s16, s18 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17 -; GFX9-NEXT: s_mul_i32 s17, s12, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 +; GFX9-NEXT: s_mul_i32 s17, s14, s17 ; GFX9-NEXT: s_add_u32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 ; GFX9-NEXT: s_addc_u32 s15, s15, s18 ; GFX9-NEXT: s_addc_u32 s16, s19, 0 -; GFX9-NEXT: s_mul_i32 s14, s12, s14 -; GFX9-NEXT: s_add_u32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s7, s14, s7 +; GFX9-NEXT: s_add_u32 s7, s15, s7 ; GFX9-NEXT: s_addc_u32 s15, 0, s16 -; GFX9-NEXT: s_add_u32 s13, s13, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, s15 -; GFX9-NEXT: s_mul_i32 s14, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13 -; GFX9-NEXT: s_add_i32 s14, s15, s14 -; GFX9-NEXT: s_mul_i32 s7, s7, s13 -; GFX9-NEXT: s_add_i32 s14, s14, s7 -; GFX9-NEXT: s_mul_i32 s6, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6 -; GFX9-NEXT: s_mul_i32 s16, s12, s6 -; GFX9-NEXT: s_mul_i32 s18, s13, s14 -; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6 -; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s18 +; GFX9-NEXT: s_add_u32 s16, s6, s7 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s6, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 +; GFX9-NEXT: s_add_i32 s6, s7, s6 +; GFX9-NEXT: s_mul_i32 s13, s13, s16 +; GFX9-NEXT: s_add_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s16, s6 +; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 +; GFX9-NEXT: s_add_u32 s12, s12, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_add_u32 s6, s6, s16 -; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14 -; GFX9-NEXT: s_addc_u32 s6, s17, s15 +; GFX9-NEXT: s_add_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 +; GFX9-NEXT: s_addc_u32 s12, s17, s13 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_i32 s14, s12, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s14 -; GFX9-NEXT: s_addc_u32 s7, 0, s7 -; GFX9-NEXT: s_add_u32 s13, s13, s6 -; GFX9-NEXT: s_addc_u32 s12, s12, s7 +; GFX9-NEXT: s_mul_i32 s6, s14, s6 +; GFX9-NEXT: s_add_u32 s6, s12, s6 +; GFX9-NEXT: s_addc_u32 s12, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s16, s6 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s12, s14, s12 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 @@ -9808,9 +9868,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s12, s2, s12 ; GFX9-NEXT: s_sub_u32 s8, s8, s12 ; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s17, s14, s3 ; GFX9-NEXT: s_sub_u32 s18, s8, s2 ; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GFX9-NEXT: s_subb_u32 s19, s17, 0 ; GFX9-NEXT: s_cmp_ge_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, -1, 0 @@ -9819,11 +9881,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s19, s3 ; GFX9-NEXT: s_cselect_b32 s20, s21, s20 ; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 -; GFX9-NEXT: s_subb_u32 s14, s17, s3 -; GFX9-NEXT: s_sub_u32 s15, s18, s2 -; GFX9-NEXT: s_subb_u32 s14, s14, 0 +; GFX9-NEXT: s_subb_u32 s17, s17, s3 +; GFX9-NEXT: s_sub_u32 s21, s18, s2 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s14, s17, 0 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b32 s15, s15, s18 +; GFX9-NEXT: s_cselect_b32 s15, s21, s18 ; GFX9-NEXT: s_cselect_b32 s14, s14, s19 ; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_subb_u32 s9, s9, s16 @@ -9847,8 +9911,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_sub_u32 s4, 0, s2 -; GFX9-NEXT: s_subb_u32 s5, 0, s3 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9858,70 +9922,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX9-NEXT: s_mul_i32 s14, s4, s9 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s6, s9 +; GFX9-NEXT: s_mul_i32 s5, s7, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s14 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s15, s4, s6 -; GFX9-NEXT: s_mul_i32 s14, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 -; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s15, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 ; GFX9-NEXT: s_add_u32 s14, s16, s14 -; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 ; GFX9-NEXT: s_mul_i32 s15, s9, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 ; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 -; GFX9-NEXT: s_addc_u32 s7, s7, s17 +; GFX9-NEXT: s_addc_u32 s5, s5, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_add_u32 s5, s5, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s14 -; GFX9-NEXT: s_add_u32 s6, s6, s7 -; GFX9-NEXT: s_addc_u32 s7, s9, s8 -; GFX9-NEXT: s_mul_i32 s8, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6 -; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4 -; GFX9-NEXT: s_mul_i32 s14, s7, s4 -; GFX9-NEXT: s_mul_i32 s16, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4 -; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8 -; GFX9-NEXT: s_add_u32 s4, s4, s16 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s4, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s7, s7, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 +; GFX9-NEXT: s_mul_i32 s9, s8, s6 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s6, s6, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s4, s4, s14 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8 -; GFX9-NEXT: s_addc_u32 s4, s15, s9 +; GFX9-NEXT: s_add_u32 s6, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 +; GFX9-NEXT: s_addc_u32 s6, s15, s7 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s8, s6, s4 -; GFX9-NEXT: s_addc_u32 s9, s7, s5 +; GFX9-NEXT: s_mul_i32 s4, s8, s4 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s6, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s8, s6 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 ; GFX9-NEXT: s_add_u32 s6, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s7, s11, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_mul_i32 s11, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 +; GFX9-NEXT: s_mul_i32 s11, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 ; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s7, s8 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9 -; GFX9-NEXT: s_addc_u32 s8, s10, s15 -; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 ; GFX9-NEXT: s_mul_i32 s9, s7, s9 -; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_add_u32 s9, s11, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s8, s9, s8 ; GFX9-NEXT: s_addc_u32 s9, 0, s10 ; GFX9-NEXT: s_mul_i32 s9, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 @@ -9932,9 +10000,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s8, s2, s8 ; GFX9-NEXT: s_sub_u32 s6, s6, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s15, s10, s3 ; GFX9-NEXT: s_sub_u32 s16, s6, s2 ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_subb_u32 s17, s15, 0 ; GFX9-NEXT: s_cmp_ge_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, -1, 0 @@ -9943,11 +10013,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cmp_eq_u32 s17, s3 ; GFX9-NEXT: s_cselect_b32 s18, s19, s18 ; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 -; GFX9-NEXT: s_subb_u32 s10, s15, s3 -; GFX9-NEXT: s_sub_u32 s11, s16, s2 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_subb_u32 s15, s15, s3 +; GFX9-NEXT: s_sub_u32 s19, s16, s2 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s15, 0 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b32 s11, s11, s16 +; GFX9-NEXT: s_cselect_b32 s11, s19, s16 ; GFX9-NEXT: s_cselect_b32 s10, s10, s17 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX9-NEXT: s_subb_u32 s7, s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 01f4414..394727c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -612,11 +612,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -652,11 +653,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -691,10 +693,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -730,10 +733,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -770,10 +774,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -813,10 +818,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -853,10 +859,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -894,15 +901,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -992,11 +999,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1034,11 +1042,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1075,10 +1084,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1117,10 +1127,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1160,10 +1171,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1206,10 +1218,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1248,10 +1261,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1292,15 +1306,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2059,11 +2073,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2099,11 +2114,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2138,10 +2154,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2177,10 +2194,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2217,10 +2235,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2260,10 +2279,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2301,10 +2321,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2342,15 +2363,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 9db6d70..258bc295 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -717,11 +717,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -761,11 +762,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -803,12 +805,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -850,10 +853,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -893,13 +897,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -944,10 +949,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -987,14 +993,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1022,7 +1028,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1036,15 +1041,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2358,6 +2363,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2410,6 +2416,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2455,12 +2462,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2507,12 +2515,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2560,13 +2569,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2616,13 +2626,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2666,16 +2677,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -2720,17 +2731,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4479,11 +4490,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4538,11 +4550,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4595,12 +4608,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4656,10 +4670,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -4713,13 +4728,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4783,10 +4799,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4844,14 +4861,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -4879,7 +4896,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff ; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -4893,15 +4909,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -6657,6 +6673,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6729,6 +6746,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6794,12 +6812,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6864,12 +6883,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -6935,13 +6955,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7015,13 +7036,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7087,16 +7109,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -7141,17 +7163,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] +; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6167a84..23c5f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -499,11 +499,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -539,11 +540,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -578,10 +580,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -618,10 +621,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -659,10 +663,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -702,10 +707,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1082,10 +1088,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1110,10 +1117,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1139,8 +1147,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1167,8 +1176,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1196,8 +1206,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1227,8 +1239,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2008,6 +2022,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2056,6 +2071,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2096,12 +2112,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2143,12 +2160,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2191,13 +2209,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2242,13 +2261,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2861,6 +2881,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2893,6 +2914,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2924,6 +2946,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2956,6 +2979,7 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2989,6 +3013,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3022,8 +3048,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3879,11 +3906,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3919,11 +3947,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3958,10 +3987,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3998,10 +4028,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4039,10 +4070,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4082,10 +4114,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -4462,10 +4495,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4490,10 +4524,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 -; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4519,8 +4554,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4547,8 +4583,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4576,8 +4613,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4607,8 +4646,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5411,6 +5452,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5459,6 +5501,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5499,12 +5542,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5546,12 +5590,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5594,13 +5639,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5645,13 +5691,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -6266,11 +6313,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6306,11 +6354,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6345,10 +6394,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6385,10 +6435,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6426,10 +6477,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6469,10 +6521,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -6873,11 +6926,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6919,11 +6973,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6960,14 +7015,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7009,11 +7065,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7052,15 +7109,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7105,11 +7163,12 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7613,11 +7672,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7653,11 +7713,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7692,10 +7753,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7732,10 +7794,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7773,10 +7836,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -7816,10 +7880,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -8219,11 +8284,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8265,11 +8331,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8306,14 +8373,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8355,11 +8423,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8398,15 +8467,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8451,11 +8521,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8959,11 +9030,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8999,11 +9071,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9038,10 +9111,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9078,10 +9152,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9119,10 +9194,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9162,10 +9238,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -9565,11 +9642,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9611,11 +9689,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9652,14 +9731,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9701,11 +9781,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9744,15 +9825,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9797,11 +9879,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -10305,11 +10388,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10345,11 +10429,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10384,10 +10469,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10424,10 +10510,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10465,10 +10552,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -10508,10 +10596,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -11166,6 +11255,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11221,6 +11311,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11272,6 +11363,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11323,6 +11415,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11375,8 +11468,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11431,8 +11525,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -12119,11 +12214,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12159,11 +12255,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12198,10 +12295,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12238,10 +12336,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12279,10 +12378,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12322,10 +12422,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -12980,6 +13081,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13035,6 +13137,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13086,6 +13189,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13137,6 +13241,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13189,8 +13294,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13245,8 +13351,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13933,11 +14040,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -13973,11 +14081,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14012,10 +14121,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14052,10 +14162,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14093,10 +14204,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14136,10 +14248,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -14788,6 +14901,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14842,6 +14956,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14892,6 +15007,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14942,6 +15058,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14995,6 +15112,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15050,6 +15169,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15732,11 +15853,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15772,11 +15894,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15811,10 +15934,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15851,10 +15975,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 ; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -15892,10 +16017,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -15935,10 +16061,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -16588,6 +16715,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16642,6 +16770,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16692,6 +16821,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16742,6 +16872,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -16795,6 +16926,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -16850,6 +16983,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 9afc0c6..e4def28 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -611,11 +611,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -651,11 +652,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -690,10 +692,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -729,10 +732,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -769,10 +773,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -812,10 +817,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -852,10 +858,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -893,15 +900,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1658,11 +1665,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1698,11 +1706,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1737,10 +1746,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1776,10 +1786,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1816,10 +1827,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1859,10 +1871,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1900,10 +1913,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1941,15 +1955,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 10fd34f..39a3c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -628,11 +628,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -669,11 +670,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -709,10 +711,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -749,10 +752,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -790,10 +794,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -834,10 +839,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -874,10 +880,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -916,15 +923,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -1826,11 +1833,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s6, v0, s3 -; GFX8-NEXT: v_writelane_b32 v1, s2, m0 -; GFX8-NEXT: s_add_i32 s2, s2, s6 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1867,11 +1875,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: v_readlane_b32 s6, v0, s3 -; GFX9-NEXT: v_writelane_b32 v1, s2, m0 -; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1907,10 +1916,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 -; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 +; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1947,10 +1957,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 ; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 -; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 ; GFX10W32-NEXT: s_add_i32 s0, s0, s3 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2 +; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1988,10 +1999,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 +; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2032,10 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_add_i32 s0, s0, s3 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2073,10 +2086,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 -; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 +; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 @@ -2115,15 +2129,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: ; implicit-def: $vgpr0 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 ; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 -; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2 +; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b96de17..4a6fa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -704,6 +704,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_add_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_addc_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -724,14 +725,16 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -743,10 +746,12 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s12, s14 -; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -759,8 +764,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s0, s12, s14 -; GFX1010-NEXT: s_addc_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_addc_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -774,8 +781,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -789,8 +798,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -803,8 +814,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -818,8 +831,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1676,6 +1691,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; CISI-NEXT: s_sub_u32 s4, s4, s6 ; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; CISI-NEXT: s_or_b32 s6, s12, s13 +; CISI-NEXT: s_cmp_lg_u32 s6, 0 ; CISI-NEXT: s_subb_u32 s5, s5, s7 ; CISI-NEXT: s_mov_b32 s8, s0 ; CISI-NEXT: s_mov_b32 s9, s1 @@ -1696,14 +1712,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1715,10 +1733,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -1731,8 +1751,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s0, s12, s14 -; GFX1010-NEXT: s_subb_u32 s1, s13, s15 +; GFX1010-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1010-NEXT: s_subb_u32 s1, s13, s15 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -1746,8 +1768,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1761,8 +1785,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6 -; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] @@ -1775,8 +1801,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_subb_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 @@ -1790,8 +1818,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 -; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 +; GFX1250-NEXT: s_cselect_b32 s1, -1, 0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15 ; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -2188,46 +2218,49 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_addc_u32 s6, s7, s9 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: v_readfirstlane_b32 s7, v0 -; VI-NEXT: s_add_u32 s10, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_add_u32 s12, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 -; VI-NEXT: s_addc_u32 s11, 0, s8 -; VI-NEXT: s_mul_i32 s8, s4, s11 +; VI-NEXT: s_addc_u32 s13, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s13 ; VI-NEXT: v_readfirstlane_b32 s9, v1 ; VI-NEXT: s_add_i32 s8, s9, s8 -; VI-NEXT: s_mul_i32 s9, s5, s10 -; VI-NEXT: s_add_i32 s12, s8, s9 -; VI-NEXT: s_sub_i32 s13, s3, s12 +; VI-NEXT: s_mul_i32 s9, s5, s12 +; VI-NEXT: s_add_i32 s14, s8, s9 +; VI-NEXT: s_sub_i32 s10, s3, s14 ; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: s_sub_u32 s14, s2, s8 +; VI-NEXT: s_sub_u32 s15, s2, s8 ; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: s_subb_u32 s13, s13, s5 -; VI-NEXT: s_sub_u32 s15, s14, s4 -; VI-NEXT: s_subb_u32 s13, s13, 0 -; VI-NEXT: s_cmp_ge_u32 s13, s5 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 +; VI-NEXT: s_subb_u32 s16, s10, s5 +; VI-NEXT: s_sub_u32 s17, s15, s4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 +; VI-NEXT: s_subb_u32 s10, s16, 0 +; VI-NEXT: s_cmp_ge_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s11, -1, 0 +; VI-NEXT: s_cmp_ge_u32 s17, s4 ; VI-NEXT: s_cselect_b32 s16, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s15, s4 -; VI-NEXT: s_cselect_b32 s15, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s13, s5 -; VI-NEXT: s_cselect_b32 s13, s15, s16 -; VI-NEXT: s_add_u32 s15, s10, 1 -; VI-NEXT: s_addc_u32 s16, s11, 0 -; VI-NEXT: s_add_u32 s17, s10, 2 -; VI-NEXT: s_addc_u32 s18, s11, 0 -; VI-NEXT: s_cmp_lg_u32 s13, 0 -; VI-NEXT: s_cselect_b32 s13, s17, s15 -; VI-NEXT: s_cselect_b32 s15, s18, s16 +; VI-NEXT: s_cmp_eq_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s10, s16, s11 +; VI-NEXT: s_add_u32 s11, s12, 1 +; VI-NEXT: s_addc_u32 s16, s13, 0 +; VI-NEXT: s_add_u32 s17, s12, 2 +; VI-NEXT: s_addc_u32 s18, s13, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s10, s17, s11 +; VI-NEXT: s_cselect_b32 s11, s18, s16 ; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 -; VI-NEXT: s_subb_u32 s3, s3, s12 +; VI-NEXT: s_subb_u32 s3, s3, s14 ; VI-NEXT: s_cmp_ge_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s8, -1, 0 -; VI-NEXT: s_cmp_ge_u32 s14, s4 +; VI-NEXT: s_cmp_ge_u32 s15, s4 ; VI-NEXT: s_cselect_b32 s9, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s3, s5 ; VI-NEXT: s_cselect_b32 s3, s9, s8 ; VI-NEXT: s_cmp_lg_u32 s3, 0 -; VI-NEXT: s_cselect_b32 s9, s15, s11 -; VI-NEXT: s_cselect_b32 s8, s13, s10 +; VI-NEXT: s_cselect_b32 s9, s11, s13 +; VI-NEXT: s_cselect_b32 s8, s10, s12 ; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2278,8 +2311,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: s_sub_u32 s10, 0, s6 +; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2288,102 +2321,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11 -; GFX9-NEXT: s_mul_i32 s13, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s8, s11 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 +; GFX9-NEXT: s_mul_i32 s13, s11, s8 +; GFX9-NEXT: s_add_i32 s9, s14, s9 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_mul_i32 s15, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 ; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, s17 ; GFX9-NEXT: s_addc_u32 s14, s16, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s9, s12, s9 +; GFX9-NEXT: s_add_u32 s9, s13, s9 ; GFX9-NEXT: s_addc_u32 s13, 0, s14 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s10, s10, s13 -; GFX9-NEXT: s_mul_i32 s12, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s9, s9, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s9 -; GFX9-NEXT: s_mul_i32 s8, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8 -; GFX9-NEXT: s_mul_i32 s14, s10, s8 -; GFX9-NEXT: s_mul_i32 s16, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s16 +; GFX9-NEXT: s_add_u32 s14, s8, s9 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s8, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 +; GFX9-NEXT: s_add_u32 s10, s10, s16 ; GFX9-NEXT: s_addc_u32 s15, 0, s15 -; GFX9-NEXT: s_add_u32 s8, s8, s14 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12 -; GFX9-NEXT: s_addc_u32 s8, s15, s13 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_mul_i32 s8, s12, s8 +; GFX9-NEXT: s_add_u32 s8, s10, s8 +; GFX9-NEXT: s_addc_u32 s10, 0, s9 +; GFX9-NEXT: s_add_u32 s11, s14, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s8, s12, s10 +; GFX9-NEXT: s_mul_i32 s10, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_add_u32 s10, s12, s10 ; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_addc_u32 s9, s10, s9 -; GFX9-NEXT: s_mul_i32 s11, s2, s9 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 -; GFX9-NEXT: s_add_u32 s11, s12, s11 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s3, s8 -; GFX9-NEXT: s_add_u32 s8, s11, s8 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9 -; GFX9-NEXT: s_addc_u32 s8, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 +; GFX9-NEXT: s_mul_i32 s11, s3, s11 +; GFX9-NEXT: s_add_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX9-NEXT: s_addc_u32 s9, s9, s13 ; GFX9-NEXT: s_addc_u32 s10, s12, 0 -; GFX9-NEXT: s_mul_i32 s9, s3, s9 -; GFX9-NEXT: s_add_u32 s11, s8, s9 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s8, s6, s10 -; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s12, s9, s8 +; GFX9-NEXT: s_addc_u32 s13, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 ; GFX9-NEXT: s_add_i32 s8, s9, s8 -; GFX9-NEXT: s_mul_i32 s9, s7, s11 -; GFX9-NEXT: s_add_i32 s12, s8, s9 -; GFX9-NEXT: s_sub_i32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s8, s6, s11 -; GFX9-NEXT: s_sub_u32 s14, s2, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s12 +; GFX9-NEXT: s_add_i32 s14, s8, s9 +; GFX9-NEXT: s_sub_i32 s10, s3, s14 +; GFX9-NEXT: s_mul_i32 s8, s6, s12 +; GFX9-NEXT: s_sub_u32 s15, s2, s8 ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX9-NEXT: s_subb_u32 s13, s13, s7 -; GFX9-NEXT: s_sub_u32 s15, s14, s6 -; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s7 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s16, s10, s7 +; GFX9-NEXT: s_sub_u32 s17, s15, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s16, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s11, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s6 ; GFX9-NEXT: s_cselect_b32 s16, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s15, s6 -; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s13, s7 -; GFX9-NEXT: s_cselect_b32 s13, s15, s16 -; GFX9-NEXT: s_add_u32 s15, s11, 1 -; GFX9-NEXT: s_addc_u32 s16, s10, 0 -; GFX9-NEXT: s_add_u32 s17, s11, 2 -; GFX9-NEXT: s_addc_u32 s18, s10, 0 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b32 s13, s17, s15 -; GFX9-NEXT: s_cselect_b32 s15, s18, s16 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s10, s16, s11 +; GFX9-NEXT: s_add_u32 s11, s12, 1 +; GFX9-NEXT: s_addc_u32 s16, s13, 0 +; GFX9-NEXT: s_add_u32 s17, s12, 2 +; GFX9-NEXT: s_addc_u32 s18, s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s10, s17, s11 +; GFX9-NEXT: s_cselect_b32 s11, s18, s16 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX9-NEXT: s_subb_u32 s3, s3, s12 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 ; GFX9-NEXT: s_cmp_ge_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s8, -1, 0 -; GFX9-NEXT: s_cmp_ge_u32 s14, s6 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 ; GFX9-NEXT: s_cselect_b32 s9, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s3, s7 ; GFX9-NEXT: s_cselect_b32 s3, s9, s8 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, s15, s10 -; GFX9-NEXT: s_cselect_b32 s8, s13, s11 +; GFX9-NEXT: s_cselect_b32 s9, s11, s13 +; GFX9-NEXT: s_cselect_b32 s8, s10, s12 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2463,40 +2503,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 ; GFX1010-NEXT: s_add_u32 s8, s8, s11 +; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1010-NEXT: s_mul_i32 s12, s9, s8 -; GFX1010-NEXT: s_mul_i32 s9, s9, s5 ; GFX1010-NEXT: s_mul_i32 s10, s10, s8 -; GFX1010-NEXT: s_add_i32 s9, s11, s9 -; GFX1010-NEXT: s_mul_i32 s11, s5, s12 +; GFX1010-NEXT: s_mul_i32 s9, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1010-NEXT: s_add_i32 s9, s13, s9 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1010-NEXT: s_mul_i32 s10, s5, s11 ; GFX1010-NEXT: s_mul_i32 s15, s8, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s15 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1010-NEXT: s_add_u32 s12, s12, s15 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 -; GFX1010-NEXT: s_add_u32 s10, s10, s11 +; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s12, s10 ; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 -; GFX1010-NEXT: s_addc_u32 s11, s12, 0 +; GFX1010-NEXT: s_addc_u32 s11, s11, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 ; GFX1010-NEXT: s_add_u32 s8, s8, s9 +; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1010-NEXT: s_mul_i32 s8, s3, s8 -; GFX1010-NEXT: s_add_u32 s9, s9, s12 -; GFX1010-NEXT: s_addc_u32 s11, 0, s11 +; GFX1010-NEXT: s_mul_i32 s12, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1010-NEXT: s_add_u32 s11, s11, s12 +; GFX1010-NEXT: s_addc_u32 s10, 0, s10 ; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s9, s8 +; GFX1010-NEXT: s_add_u32 s8, s11, s8 ; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s11, s10 +; GFX1010-NEXT: s_addc_u32 s8, s10, s9 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 ; GFX1010-NEXT: s_add_u32 s5, s8, s5 ; GFX1010-NEXT: s_addc_u32 s8, 0, s9 @@ -2509,8 +2553,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, s7 ; GFX1010-NEXT: s_sub_u32 s13, s10, s6 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1010-NEXT: s_subb_u32 s11, s11, 0 ; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 @@ -2616,40 +2663,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 ; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 -; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 -; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 ; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 ; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 ; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 -; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 +; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 ; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 -; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 -; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 +; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 ; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 +; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 ; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 ; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 @@ -2662,8 +2713,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 ; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 ; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 @@ -2736,8 +2790,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5 +; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2746,102 +2800,109 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0 -; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s12, s10 -; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7 -; GFX1030W64-NEXT: s_add_i32 s10, s10, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13 -; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 +; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 +; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 ; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 ; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10 +; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 ; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s10 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s10, s8 -; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11 -; GFX1030W64-NEXT: s_add_i32 s8, s8, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11 -; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s14 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11 +; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 +; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 ; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8 -; GFX1030W64-NEXT: s_add_u32 s9, s9, s10 -; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12 -; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 -; GFX1030W64-NEXT: s_add_u32 s8, s9, s8 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10 -; GFX1030W64-NEXT: s_add_u32 s7, s7, s8 -; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7 -; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 +; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 +; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 +; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 ; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W64-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6 -; GFX1030W64-NEXT: s_add_u32 s7, s8, s7 -; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6 -; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 ; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s7, s6 +; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 ; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 ; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 ; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 ; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 ; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 -; GFX1030W64-NEXT: s_add_i32 s8, s6, s8 +; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 ; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 -; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8 -; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6 +; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 +; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 ; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5 -; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4 -; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 +; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 ; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 -; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5 -; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14 -; GFX1030W64-NEXT: s_add_u32 s13, s10, 1 +; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 ; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 ; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 ; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 ; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 ; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 ; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 ; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 -; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10 +; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2927,40 +2988,44 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 ; GFX11-NEXT: s_add_u32 s8, s8, s11 +; GFX11-NEXT: s_cselect_b32 s11, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mul_i32 s11, s9, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX11-NEXT: s_mul_i32 s12, s9, s8 -; GFX11-NEXT: s_mul_i32 s9, s9, s7 ; GFX11-NEXT: s_mul_i32 s10, s10, s8 -; GFX11-NEXT: s_add_i32 s9, s11, s9 -; GFX11-NEXT: s_mul_i32 s11, s7, s12 +; GFX11-NEXT: s_mul_i32 s9, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX11-NEXT: s_add_i32 s9, s13, s9 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX11-NEXT: s_mul_i32 s10, s7, s11 ; GFX11-NEXT: s_mul_i32 s15, s8, s9 ; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s15 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX11-NEXT: s_add_u32 s12, s12, s15 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 -; GFX11-NEXT: s_add_u32 s10, s10, s11 +; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s12, s10 ; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 -; GFX11-NEXT: s_addc_u32 s11, s12, 0 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 ; GFX11-NEXT: s_add_u32 s8, s8, s9 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX11-NEXT: s_mul_i32 s8, s3, s8 -; GFX11-NEXT: s_add_u32 s9, s9, s12 -; GFX11-NEXT: s_addc_u32 s11, 0, s11 +; GFX11-NEXT: s_mul_i32 s12, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX11-NEXT: s_add_u32 s11, s11, s12 +; GFX11-NEXT: s_addc_u32 s10, 0, s10 ; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s9, s8 +; GFX11-NEXT: s_add_u32 s8, s11, s8 ; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s11, s10 +; GFX11-NEXT: s_addc_u32 s8, s10, s9 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 ; GFX11-NEXT: s_add_u32 s7, s8, s7 ; GFX11-NEXT: s_addc_u32 s8, 0, s9 @@ -2970,14 +3035,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_add_i32 s9, s9, s10 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 ; GFX11-NEXT: s_sub_u32 s10, s2, s10 ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, s5 ; GFX11-NEXT: s_sub_u32 s13, s10, s4 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_subb_u32 s11, s11, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s14, -1, 0 ; GFX11-NEXT: s_cmp_ge_u32 s13, s4 @@ -3050,8 +3118,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: ; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 @@ -3086,9 +3155,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 ; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 @@ -3103,17 +3175,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 -; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 -; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8 -; GFX1250-NEXT: s_mul_i32 s12, s3, s8 +; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_mul_i32 s11, s3, s8 ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s4, s8, s12 -; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] @@ -3128,8 +3202,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 ; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 +; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 07e6a76..4b151b9 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -714,8 +714,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_flbit_i32_b32 s3, s3 ; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_flbit_i32_b32 s3, s3 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index fca57be..cefcbdd 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1491,6 +1491,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1520,6 +1521,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index dbdea8e..d8a5e7fa 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,6 +14,7 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_add_u32 s7, s6, s6 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -30,6 +31,8 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s7, s6, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec @@ -46,6 +49,8 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s5, s4, s4 +; GFX10-NEXT: s_cselect_b32 s6, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 ; GFX10-NEXT: s_addc_u32 s6, s4, 0 ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo @@ -62,13 +67,16 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_addc_u32 s2, s0, 0 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 ; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -96,6 +104,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] @@ -116,10 +125,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s1, s0, s0 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 +; GFX9-NEXT: s_add_u32 s0, s2, s2 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 @@ -142,6 +153,8 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s1, s0, s0 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 @@ -165,9 +178,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s1, s0, s0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 9a17538..62847b1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1117,6 +1117,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; SI: ; %bb.0: ; SI-NEXT: s_and_b32 s3, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s3, s0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1168,6 +1169,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; VI: ; %bb.0: ; VI-NEXT: s_and_b32 s3, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s3, s0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: s_lshr_b32 s0, s1, 8 @@ -1215,6 +1217,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s3, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 @@ -1261,9 +1264,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -1315,9 +1320,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 ; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 @@ -4016,6 +4023,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s6, s4, 0xffe ; SI-NEXT: s_and_b32 s4, s1, 0x1ff ; SI-NEXT: s_or_b32 s0, s4, s0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -4058,6 +4066,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; SI-NEXT: s_and_b32 s5, s0, 0xffe ; SI-NEXT: s_and_b32 s0, s3, 0x1ff ; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SI-NEXT: v_readfirstlane_b32 s0, v2 @@ -4111,9 +4120,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_lshr_b32 s5, s3, 8 -; VI-NEXT: s_and_b32 s5, s5, 0xffe ; VI-NEXT: s_and_b32 s6, s3, 0x1ff +; VI-NEXT: s_and_b32 s5, s5, 0xffe ; VI-NEXT: s_or_b32 s2, s6, s2 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014 @@ -4153,6 +4163,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; VI-NEXT: s_and_b32 s7, s2, 0xffe ; VI-NEXT: s_and_b32 s2, s1, 0x1ff ; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014 @@ -4198,9 +4209,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s5, s3, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-NEXT: s_or_b32 s2, s6, s2 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] ; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014 @@ -4242,6 +4254,7 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX9-NEXT: s_and_b32 s6, s2, 0xffe ; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff ; GFX9-NEXT: s_or_b32 s0, s2, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -4288,10 +4301,11 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; ; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff -; GFX11-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-NEXT: s_or_b32 s2, s6, s2 +; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-NEXT: s_lshr_b32 s6, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s6, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -4334,12 +4348,13 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-NEXT: s_cselect_b32 s2, s5, s6 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index c28b25c7..b0dd187 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -599,8 +599,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 @@ -709,8 +711,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -820,8 +824,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -931,8 +937,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -1110,15 +1118,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1165,15 +1175,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1354,15 +1366,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -1409,15 +1423,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 @@ -2138,8 +2154,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe ; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 ; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 @@ -2175,10 +2193,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 ; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 ; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe +; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 ; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 -; SI-GISEL-NEXT: s_or_b32 s4, s9, s6 +; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 ; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 ; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 ; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 @@ -2335,8 +2355,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2370,12 +2392,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 ; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 +; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2531,8 +2555,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2566,12 +2592,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -2724,8 +2752,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe ; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 ; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 @@ -2759,12 +2789,14 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 ; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 ; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe -; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff ; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 ; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 @@ -3041,15 +3073,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3081,17 +3115,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 ; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3140,15 +3176,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3180,17 +3218,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 ; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3471,15 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3511,17 +3553,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 ; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 @@ -3570,15 +3614,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 @@ -3610,17 +3656,19 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff ; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 ; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 ; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10 ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 ; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4 ; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 0deef8b..5d31177 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -182,6 +182,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: s_and_b32 s1, s7, 0x1ff ; SI-NEXT: s_and_b32 s8, s0, 0xffe ; SI-NEXT: s_or_b32 s0, s1, s6 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 @@ -236,6 +237,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe ; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff ; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 +; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; VI-SDAG-NEXT: s_mov_b32 s1, s5 ; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -288,8 +290,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe ; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 @@ -331,10 +335,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8 -; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe -; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2 +; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 @@ -382,14 +387,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 @@ -431,10 +438,11 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe -; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2 +; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 @@ -490,15 +498,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 ; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 ; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 ; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 31f277f..37756d1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -472,6 +472,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -535,10 +536,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -604,6 +606,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -657,11 +660,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -706,8 +710,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1685,6 +1690,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1748,10 +1754,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1817,6 +1824,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -1870,11 +1878,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1919,8 +1928,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2958,6 +2968,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3021,10 +3032,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3090,6 +3102,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3143,11 +3156,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3192,8 +3206,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3727,6 +3742,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3790,10 +3806,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3859,6 +3876,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3912,11 +3930,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3961,8 +3980,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4999,6 +5019,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5062,10 +5083,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5131,6 +5153,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5184,11 +5207,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5246,8 +5270,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6259,6 +6284,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6328,6 +6354,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6397,6 +6424,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6457,6 +6485,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6520,6 +6550,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7686,6 +7717,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7755,6 +7787,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7824,6 +7857,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7884,6 +7918,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7947,6 +7983,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9113,6 +9150,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9182,6 +9220,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9251,6 +9290,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9311,6 +9351,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9374,6 +9416,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10022,6 +10065,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10091,6 +10135,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10160,6 +10205,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10220,6 +10266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10283,6 +10331,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11449,6 +11498,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11518,6 +11568,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11587,6 +11638,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11647,6 +11699,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11710,6 +11764,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 4581efc..6351bb3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index bd570d9..a9ac008 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -381,12 +381,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -456,6 +457,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -511,6 +513,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 @@ -559,7 +562,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -606,9 +610,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1414,12 +1420,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -1544,6 +1552,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1592,7 +1601,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1639,9 +1649,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -2447,12 +2459,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX9-NEXT: v_readlane_b32 s3, v0, s2 -; GFX9-NEXT: v_max_f32_e64 v1, s3, s3 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX9-NEXT: v_max_f32_e64 v2, s4, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: v_min_f32_e32 v2, v1, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2522,6 +2535,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -2577,6 +2591,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2 ; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 @@ -2625,7 +2640,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2672,9 +2688,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3573,6 +3591,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3646,6 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -3704,6 +3724,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3753,7 +3774,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3819,9 +3841,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4836,6 +4859,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4909,6 +4933,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -4967,6 +4992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5016,7 +5042,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -5082,9 +5109,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6099,6 +6127,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -6172,6 +6201,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd @@ -6230,6 +6260,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -6279,7 +6310,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -6345,9 +6377,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 1f2d70c..6311143 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -532,6 +532,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -595,10 +596,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -664,6 +666,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -717,11 +720,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -779,8 +783,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -1857,6 +1862,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -1920,10 +1926,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1989,6 +1996,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -2042,11 +2050,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2104,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -3182,6 +3192,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -3245,10 +3256,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3314,6 +3326,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -3367,11 +3380,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3429,8 +3443,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -4003,6 +4018,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -4066,10 +4082,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4135,6 +4152,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -4188,11 +4206,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4250,8 +4269,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -5327,6 +5347,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd @@ -5390,10 +5411,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] -; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5459,6 +5481,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd @@ -5512,11 +5535,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5574,8 +5598,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd @@ -6587,6 +6612,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6656,6 +6682,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6725,6 +6752,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6785,6 +6813,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6848,6 +6878,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8013,6 +8044,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8082,6 +8114,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8151,6 +8184,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8211,6 +8245,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8274,6 +8310,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9440,6 +9477,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9509,6 +9547,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9578,6 +9617,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9638,6 +9678,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9701,6 +9743,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10349,6 +10392,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10418,6 +10462,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10487,6 +10532,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10547,6 +10593,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -10610,6 +10658,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11775,6 +11824,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11844,6 +11894,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11913,6 +11964,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -11973,6 +12025,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -12036,6 +12090,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3] +; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index c3f3917..eee232a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -136,17 +136,19 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: s_and_b32 s13, s8, s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s13, s13, exec_lo +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 ; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_and_b32 s20, s9, exec_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 6dc9199..8748aff 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8265,10 +8265,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 -; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -8349,13 +8351,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX942-NEXT: .LBB28_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readfirstlane_b32 s6, v1 -; GFX942-NEXT: s_mov_b32 m0, s3 -; GFX942-NEXT: v_readlane_b32 s8, v2, s3 -; GFX942-NEXT: v_writelane_b32 v0, s6, m0 ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX942-NEXT: v_readfirstlane_b32 s8, v1 +; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: s_mov_b32 m0, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX942-NEXT: v_writelane_b32 v0, s8, m0 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8437,14 +8440,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_lshl_b32 s1, 1, s1 -; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8524,10 +8528,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_lshl_b32 s1, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8604,13 +8609,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 -; GFX90A-NEXT: s_mov_b32 m0, s3 -; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 -; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8686,13 +8692,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: v_readfirstlane_b32 s6, v1 -; GFX908-NEXT: s_mov_b32 m0, s3 -; GFX908-NEXT: v_readlane_b32 s8, v2, s3 -; GFX908-NEXT: v_writelane_b32 v0, s6, m0 ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8769,13 +8776,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s6, v1 -; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v2, s3 -; GFX8-NEXT: v_writelane_b32 v0, s6, m0 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s8, v1 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX8-NEXT: v_writelane_b32 v0, s8, m0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9122,10 +9130,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 ; GFX12-NEXT: v_writelane_b32 v0, s0, s3 -; GFX12-NEXT: s_lshl_b32 s3, 1, s3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 s1, s1, s3 +; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12-NEXT: s_add_f32 s0, s0, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd @@ -9202,13 +9212,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX942-NEXT: .LBB29_5: ; %ComputeLoop ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readfirstlane_b32 s6, v1 -; GFX942-NEXT: s_mov_b32 m0, s3 -; GFX942-NEXT: v_readlane_b32 s8, v2, s3 -; GFX942-NEXT: v_writelane_b32 v0, s6, m0 ; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX942-NEXT: v_readfirstlane_b32 s8, v1 +; GFX942-NEXT: v_readlane_b32 s9, v2, s3 +; GFX942-NEXT: s_mov_b32 m0, s3 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX942-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX942-NEXT: v_writelane_b32 v0, s8, m0 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX942-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX942-NEXT: ; %bb.6: ; %ComputeEnd ; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9285,14 +9296,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 +; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 ; GFX11-NEXT: v_writelane_b32 v0, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_lshl_b32 s1, 1, s1 -; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9365,10 +9377,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 +; GFX10-NEXT: s_lshl_b32 s7, 1, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s7 ; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_lshl_b32 s1, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9440,13 +9453,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX90A-NEXT: v_readfirstlane_b32 s6, v1 -; GFX90A-NEXT: s_mov_b32 m0, s3 -; GFX90A-NEXT: v_readlane_b32 s8, v2, s3 -; GFX90A-NEXT: v_writelane_b32 v0, s6, m0 ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9519,13 +9533,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX908-NEXT: v_readfirstlane_b32 s6, v1 -; GFX908-NEXT: s_mov_b32 m0, s3 -; GFX908-NEXT: v_readlane_b32 s8, v2, s3 -; GFX908-NEXT: v_writelane_b32 v0, s6, m0 ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX908-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9599,13 +9614,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s6, v1 -; GFX8-NEXT: s_mov_b32 m0, s3 -; GFX8-NEXT: v_readlane_b32 s8, v2, s3 -; GFX8-NEXT: v_writelane_b32 v0, s6, m0 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s8, v1 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: v_add_f32_e32 v1, s8, v1 +; GFX8-NEXT: v_writelane_b32 v0, s8, m0 +; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fba42c4..c1cf06e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -388,8 +388,9 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_NOP 0, implicit killed $scc + ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} @@ -416,80 +417,6 @@ body: | S_ENDPGM 0 ... ---- -name: xor_1_cmp_lg_0_killed_scc -body: | - ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc - ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1(0x40000000), %bb.2(0x40000000) - liveins: $sgpr0, $vgpr0_vgpr1 - - %0:sreg_32 = COPY $sgpr0 - %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc - S_NOP 0, implicit killed $scc - S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc - S_CBRANCH_SCC0 %bb.2, implicit $scc - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2(0x80000000) - - bb.2: - S_ENDPGM 0 - -... ---- -name: absdiff_1_cmp_lg_0_killed_scc -body: | - ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_NOP 0, implicit $scc - ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1(0x40000000), %bb.2(0x40000000) - liveins: $sgpr0, $vgpr0_vgpr1 - - %0:sreg_32 = COPY $sgpr0 - %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc - S_NOP 0, implicit killed $scc - S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc - S_CBRANCH_SCC0 %bb.2, implicit $scc - S_BRANCH %bb.1 - - bb.1: - successors: %bb.2(0x80000000) - - bb.2: - S_ENDPGM 0 - -... --- name: and_1_cmp_eq_1_clobbered_scc @@ -2143,7 +2070,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 19cc7f7..f53aaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -10,6 +10,7 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -24,6 +25,7 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -38,6 +40,7 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: lshr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -52,6 +55,7 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: lshr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -66,6 +70,7 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: ashr32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -80,6 +85,7 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: ashr64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -94,6 +100,7 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) { ; CHECK-LABEL: abs32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_abs_i32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -114,6 +121,7 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: and32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -128,6 +136,7 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: and64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -142,6 +151,7 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: or32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -156,6 +166,7 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: or64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -170,6 +181,7 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -184,6 +196,7 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -198,6 +211,7 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nand32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -217,6 +231,7 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nand64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -236,6 +251,7 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: nor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -255,6 +271,7 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -274,6 +291,7 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: xnor32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -293,6 +311,7 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: xnor64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -312,6 +331,7 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: andn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -327,6 +347,7 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: nandn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -342,6 +363,7 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: orn232: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -357,6 +379,7 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: orn264: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -372,6 +395,7 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { ; CHECK-LABEL: bfe_i32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -409,6 +433,7 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { ; CHECK-LABEL: bfe_u32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -488,6 +513,7 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { ; CHECK-LABEL: bcnt132: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -526,6 +552,7 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { ; CHECK-LABEL: quadmask32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -544,6 +571,7 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { ; CHECK-LABEL: quadmask64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -562,6 +590,7 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) { ; CHECK-LABEL: not32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s0 ; CHECK-NEXT: ;;#ASMEND @@ -580,6 +609,7 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) { ; CHECK-LABEL: not64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll index 7552f6b..a828ee0 100644 --- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -12,6 +12,8 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { ; CHECK-LABEL: s_uaddo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_addc_u32 s0, 1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) @@ -30,6 +32,8 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: s_usubo_pseudo: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_sub_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 ; CHECK-NEXT: s_subb_u32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94..5f6d622 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -56,9 +56,10 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s15, 0, s16 ; GCN-NEXT: s_add_u32 s16, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s15 ; GCN-NEXT: s_mul_i32 s0, s12, s14 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -89,6 +90,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s15, s16, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s14, s14, s12 ; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s12 @@ -114,50 +116,52 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_addc_u32 s4, s4, 0 ; GCN-NEXT: s_mul_i32 s14, s7, s14 -; GCN-NEXT: s_add_u32 s16, s1, s14 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 -; GCN-NEXT: s_addc_u32 s17, 0, s4 +; GCN-NEXT: s_addc_u32 s15, 0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mul_i32 s4, s10, s17 +; GCN-NEXT: s_mul_i32 s4, s10, s15 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: s_mul_i32 s5, s11, s16 -; GCN-NEXT: s_add_i32 s18, s4, s5 -; GCN-NEXT: s_sub_i32 s14, s7, s18 -; GCN-NEXT: s_mul_i32 s4, s10, s16 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s15, s4, s5 -; GCN-NEXT: s_subb_u32 s19, s14, s11 -; GCN-NEXT: s_sub_u32 s20, s6, s10 -; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GCN-NEXT: s_or_b32 s14, s14, s15 -; GCN-NEXT: s_subb_u32 s14, s19, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s20, s10 -; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s11 -; GCN-NEXT: s_cselect_b32 s14, s19, s15 -; GCN-NEXT: s_add_u32 s15, s16, 1 -; GCN-NEXT: s_addc_u32 s19, s17, 0 -; GCN-NEXT: s_add_u32 s20, s16, 2 -; GCN-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NEXT: s_cmp_lg_u32 s14, 0 -; GCN-NEXT: s_cselect_b32 s14, s20, s15 -; GCN-NEXT: s_cselect_b32 s15, s21, s19 +; GCN-NEXT: s_or_b32 s18, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s19, s6, s10 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s18 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s17, 0 ; GCN-NEXT: s_cmp_ge_u32 s4, s11 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s6, s10 -; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s19, s10 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s4, s11 -; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: s_cselect_b32 s4, s17, s5 +; GCN-NEXT: s_add_u32 s5, s14, 1 +; GCN-NEXT: s_addc_u32 s17, s15, 0 +; GCN-NEXT: s_add_u32 s19, s14, 2 +; GCN-NEXT: s_addc_u32 s20, s15, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s15, s17 -; GCN-NEXT: s_cselect_b32 s4, s14, s16 +; GCN-NEXT: s_cselect_b32 s4, s19, s5 +; GCN-NEXT: s_cselect_b32 s5, s20, s17 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s16 +; GCN-NEXT: s_cmp_ge_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s10 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s6, s6, s16 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s5, s5, s15 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 ; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 @@ -204,6 +208,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 @@ -237,6 +242,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 +; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9] @@ -1189,9 +1195,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s12, 0, s13 ; GCN-NEXT: s_add_u32 s13, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s11, s11, s12 ; GCN-NEXT: s_mul_i32 s8, s2, s11 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1222,6 +1229,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s13, s2 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s8, s11, s10 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 @@ -1230,46 +1238,48 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 ; GCN-NEXT: s_add_u32 s8, s10, s8 -; GCN-NEXT: s_addc_u32 s12, 0, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: s_mul_i32 s8, s7, s12 +; GCN-NEXT: s_mul_i32 s8, s7, s10 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 -; GCN-NEXT: s_add_i32 s13, s9, s8 -; GCN-NEXT: s_sub_i32 s10, 0, s13 -; GCN-NEXT: s_mul_i32 s8, s6, s12 -; GCN-NEXT: s_sub_u32 s14, 24, s8 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s15, s13, s6 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s11, s8, s9 -; GCN-NEXT: s_subb_u32 s15, s10, s7 -; GCN-NEXT: s_sub_u32 s16, s14, s6 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s11, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s6 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s10, s7 -; GCN-NEXT: s_cselect_b32 s10, s15, s11 -; GCN-NEXT: s_add_u32 s11, s12, 1 -; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_add_u32 s16, s12, 2 -; GCN-NEXT: s_addc_u32 s17, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s10, 0 -; GCN-NEXT: s_cselect_b32 s10, s16, s11 -; GCN-NEXT: s_cselect_b32 s11, s17, s15 ; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, 0, s13 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_subb_u32 s8, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s8, s7 ; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s6 -; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s6 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s8, s7 -; GCN-NEXT: s_cselect_b32 s6, s6, s9 +; GCN-NEXT: s_cselect_b32 s8, s12, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s12, 0, 0 +; GCN-NEXT: s_add_u32 s15, s10, 2 +; GCN-NEXT: s_addc_u32 s16, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s15, s9 +; GCN-NEXT: s_cselect_b32 s9, s16, s12 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s11, 0, s11 +; GCN-NEXT: s_cmp_ge_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s12 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cselect_b32 s7, s11, 0 -; GCN-NEXT: s_cselect_b32 s6, s10, s12 +; GCN-NEXT: s_cselect_b32 s7, s9, 0 +; GCN-NEXT: s_cselect_b32 s6, s8, s10 ; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_subb_u32 s7, s7, s4 @@ -1305,6 +1315,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -1337,6 +1348,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b..bbd1793 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: s_sub_u32 s3, 0, s8 -; GCN-NEXT: s_subb_u32 s10, 0, s9 +; GCN-NEXT: s_subb_u32 s12, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,52 +1522,56 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s15, s3, s12 -; GCN-NEXT: s_mul_i32 s14, s10, s12 -; GCN-NEXT: s_add_i32 s13, s15, s13 -; GCN-NEXT: s_add_i32 s13, s13, s14 -; GCN-NEXT: s_mul_i32 s16, s3, s12 -; GCN-NEXT: s_mul_i32 s15, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s12, s16 -; GCN-NEXT: s_mul_hi_u32 s14, s12, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 +; GCN-NEXT: s_mul_i32 s14, s12, s10 +; GCN-NEXT: s_add_i32 s11, s15, s11 +; GCN-NEXT: s_add_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s10 +; GCN-NEXT: s_mul_i32 s15, s10, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 ; GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s18, s11, s16 -; GCN-NEXT: s_mul_i32 s16, s11, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 +; GCN-NEXT: s_mul_i32 s16, s13, s16 ; GCN-NEXT: s_add_u32 s15, s15, s16 -; GCN-NEXT: s_mul_hi_u32 s17, s11, s13 +; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 ; GCN-NEXT: s_addc_u32 s14, s14, s18 ; GCN-NEXT: s_addc_u32 s15, s17, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s13, s14, s13 +; GCN-NEXT: s_mul_i32 s11, s13, s11 +; GCN-NEXT: s_add_u32 s11, s14, s11 ; GCN-NEXT: s_addc_u32 s14, 0, s15 -; GCN-NEXT: s_add_u32 s12, s12, s13 -; GCN-NEXT: s_addc_u32 s11, s11, s14 -; GCN-NEXT: s_mul_i32 s13, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s12 -; GCN-NEXT: s_add_i32 s13, s14, s13 -; GCN-NEXT: s_mul_i32 s10, s10, s12 -; GCN-NEXT: s_add_i32 s13, s13, s10 -; GCN-NEXT: s_mul_i32 s3, s3, s12 -; GCN-NEXT: s_mul_hi_u32 s14, s11, s3 -; GCN-NEXT: s_mul_i32 s15, s11, s3 -; GCN-NEXT: s_mul_i32 s17, s12, s13 -; GCN-NEXT: s_mul_hi_u32 s3, s12, s3 -; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: s_add_u32 s15, s10, s11 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s10, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 +; GCN-NEXT: s_add_i32 s10, s11, s10 +; GCN-NEXT: s_mul_i32 s12, s12, s15 +; GCN-NEXT: s_add_i32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 +; GCN-NEXT: s_mul_i32 s14, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s15, s10 +; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 ; GCN-NEXT: s_add_u32 s3, s3, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_mul_hi_u32 s10, s11, s13 -; GCN-NEXT: s_addc_u32 s3, s16, s14 -; GCN-NEXT: s_addc_u32 s10, s10, 0 -; GCN-NEXT: s_mul_i32 s13, s11, s13 -; GCN-NEXT: s_add_u32 s3, s3, s13 -; GCN-NEXT: s_addc_u32 s10, 0, s10 -; GCN-NEXT: s_add_u32 s3, s12, s3 -; GCN-NEXT: s_addc_u32 s14, s11, s10 +; GCN-NEXT: s_add_u32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 +; GCN-NEXT: s_addc_u32 s3, s16, s12 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s10, s13, s10 +; GCN-NEXT: s_add_u32 s3, s3, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s11 +; GCN-NEXT: s_add_u32 s3, s15, s3 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s14, s13, s12 ; GCN-NEXT: s_ashr_i32 s10, s5, 31 ; GCN-NEXT: s_add_u32 s12, s4, s10 ; GCN-NEXT: s_mov_b32 s11, s10 @@ -1596,9 +1600,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s3, s8, s3 ; GCN-NEXT: s_sub_u32 s3, s12, s3 ; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s12, s16, s9 ; GCN-NEXT: s_sub_u32 s18, s3, s8 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s19, s12, 0 ; GCN-NEXT: s_cmp_ge_u32 s19, s9 ; GCN-NEXT: s_cselect_b32 s20, -1, 0 @@ -1608,10 +1614,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b32 s20, s21, s20 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, s9 -; GCN-NEXT: s_sub_u32 s16, s18, s8 +; GCN-NEXT: s_sub_u32 s21, s18, s8 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s12, s12, 0 ; GCN-NEXT: s_cmp_lg_u32 s20, 0 -; GCN-NEXT: s_cselect_b32 s16, s16, s18 +; GCN-NEXT: s_cselect_b32 s16, s21, s18 ; GCN-NEXT: s_cselect_b32 s12, s12, s19 ; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 ; GCN-NEXT: s_subb_u32 s5, s13, s5 @@ -1923,9 +1931,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s3, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -1935,10 +1945,12 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s3, s3, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s3, s3, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s5, s13, s5 @@ -2718,7 +2730,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s9, 0, s6 -; GCN-NEXT: s_subb_u32 s14, 0, s7 +; GCN-NEXT: s_subb_u32 s16, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2727,52 +2739,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s15, v1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s9, s16 -; GCN-NEXT: s_mul_i32 s18, s14, s16 -; GCN-NEXT: s_add_i32 s17, s19, s17 -; GCN-NEXT: s_add_i32 s17, s17, s18 -; GCN-NEXT: s_mul_i32 s20, s9, s16 -; GCN-NEXT: s_mul_i32 s19, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s16, s20 -; GCN-NEXT: s_mul_hi_u32 s18, s16, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 +; GCN-NEXT: s_mul_i32 s18, s16, s14 +; GCN-NEXT: s_add_i32 s15, s19, s15 +; GCN-NEXT: s_add_i32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s14 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 ; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s22, s15, s20 -; GCN-NEXT: s_mul_i32 s20, s15, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 +; GCN-NEXT: s_mul_i32 s20, s17, s20 ; GCN-NEXT: s_add_u32 s19, s19, s20 -; GCN-NEXT: s_mul_hi_u32 s21, s15, s17 +; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 ; GCN-NEXT: s_addc_u32 s18, s18, s22 ; GCN-NEXT: s_addc_u32 s19, s21, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s17, s18, s17 +; GCN-NEXT: s_mul_i32 s15, s17, s15 +; GCN-NEXT: s_add_u32 s15, s18, s15 ; GCN-NEXT: s_addc_u32 s18, 0, s19 -; GCN-NEXT: s_add_u32 s16, s16, s17 -; GCN-NEXT: s_addc_u32 s15, s15, s18 -; GCN-NEXT: s_mul_i32 s17, s9, s15 -; GCN-NEXT: s_mul_hi_u32 s18, s9, s16 -; GCN-NEXT: s_add_i32 s17, s18, s17 -; GCN-NEXT: s_mul_i32 s14, s14, s16 -; GCN-NEXT: s_add_i32 s17, s17, s14 -; GCN-NEXT: s_mul_i32 s9, s9, s16 -; GCN-NEXT: s_mul_hi_u32 s18, s15, s9 -; GCN-NEXT: s_mul_i32 s19, s15, s9 -; GCN-NEXT: s_mul_i32 s21, s16, s17 -; GCN-NEXT: s_mul_hi_u32 s9, s16, s9 -; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: s_add_u32 s19, s14, s15 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s14, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s16, s16, s19 +; GCN-NEXT: s_add_i32 s14, s14, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 +; GCN-NEXT: s_mul_i32 s18, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s19, s14 +; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 ; GCN-NEXT: s_add_u32 s9, s9, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_mul_hi_u32 s14, s15, s17 -; GCN-NEXT: s_addc_u32 s9, s20, s18 -; GCN-NEXT: s_addc_u32 s14, s14, 0 -; GCN-NEXT: s_mul_i32 s17, s15, s17 -; GCN-NEXT: s_add_u32 s9, s9, s17 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_add_u32 s9, s16, s9 -; GCN-NEXT: s_addc_u32 s18, s15, s14 +; GCN-NEXT: s_add_u32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 +; GCN-NEXT: s_addc_u32 s9, s20, s16 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s14, s17, s14 +; GCN-NEXT: s_add_u32 s9, s9, s14 +; GCN-NEXT: s_addc_u32 s16, 0, s15 +; GCN-NEXT: s_add_u32 s9, s19, s9 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s18, s17, s16 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: s_add_u32 s16, s10, s14 ; GCN-NEXT: s_mov_b32 s15, s14 @@ -2801,9 +2817,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s6, s9 ; GCN-NEXT: s_sub_u32 s9, s16, s9 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s16, s20, s7 ; GCN-NEXT: s_sub_u32 s22, s9, s6 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s23, s16, 0 ; GCN-NEXT: s_cmp_ge_u32 s23, s7 ; GCN-NEXT: s_cselect_b32 s24, -1, 0 @@ -2813,10 +2831,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s24, s25, s24 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, s7 -; GCN-NEXT: s_sub_u32 s20, s22, s6 +; GCN-NEXT: s_sub_u32 s25, s22, s6 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s16, s16, 0 ; GCN-NEXT: s_cmp_lg_u32 s24, 0 -; GCN-NEXT: s_cselect_b32 s20, s20, s22 +; GCN-NEXT: s_cselect_b32 s20, s25, s22 ; GCN-NEXT: s_cselect_b32 s16, s16, s23 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s11, s17, s11 @@ -2867,7 +2887,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2876,52 +2896,56 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -2950,9 +2974,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -2962,10 +2988,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -3435,9 +3463,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v0 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -3447,10 +3477,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 @@ -4902,7 +4934,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GCN-NEXT: s_sub_u32 s17, 0, s6 -; GCN-NEXT: s_subb_u32 s22, 0, s7 +; GCN-NEXT: s_subb_u32 s24, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -4911,52 +4943,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s23, v1 -; GCN-NEXT: v_readfirstlane_b32 s24, v0 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s17, s24 -; GCN-NEXT: s_mul_i32 s26, s22, s24 -; GCN-NEXT: s_add_i32 s25, s27, s25 -; GCN-NEXT: s_add_i32 s25, s25, s26 -; GCN-NEXT: s_mul_i32 s28, s17, s24 -; GCN-NEXT: s_mul_i32 s27, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s29, s24, s28 -; GCN-NEXT: s_mul_hi_u32 s26, s24, s25 +; GCN-NEXT: v_readfirstlane_b32 s25, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 +; GCN-NEXT: s_mul_i32 s26, s24, s22 +; GCN-NEXT: s_add_i32 s23, s27, s23 +; GCN-NEXT: s_add_i32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s22 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 ; GCN-NEXT: s_add_u32 s27, s29, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_mul_hi_u32 s30, s23, s28 -; GCN-NEXT: s_mul_i32 s28, s23, s28 +; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 +; GCN-NEXT: s_mul_i32 s28, s25, s28 ; GCN-NEXT: s_add_u32 s27, s27, s28 -; GCN-NEXT: s_mul_hi_u32 s29, s23, s25 +; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 ; GCN-NEXT: s_addc_u32 s26, s26, s30 ; GCN-NEXT: s_addc_u32 s27, s29, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s25, s26, s25 +; GCN-NEXT: s_mul_i32 s23, s25, s23 +; GCN-NEXT: s_add_u32 s23, s26, s23 ; GCN-NEXT: s_addc_u32 s26, 0, s27 -; GCN-NEXT: s_add_u32 s24, s24, s25 -; GCN-NEXT: s_addc_u32 s23, s23, s26 -; GCN-NEXT: s_mul_i32 s25, s17, s23 -; GCN-NEXT: s_mul_hi_u32 s26, s17, s24 -; GCN-NEXT: s_add_i32 s25, s26, s25 -; GCN-NEXT: s_mul_i32 s22, s22, s24 -; GCN-NEXT: s_add_i32 s25, s25, s22 -; GCN-NEXT: s_mul_i32 s17, s17, s24 -; GCN-NEXT: s_mul_hi_u32 s26, s23, s17 -; GCN-NEXT: s_mul_i32 s27, s23, s17 -; GCN-NEXT: s_mul_i32 s29, s24, s25 -; GCN-NEXT: s_mul_hi_u32 s17, s24, s17 -; GCN-NEXT: s_mul_hi_u32 s28, s24, s25 +; GCN-NEXT: s_add_u32 s27, s22, s23 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s22, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 +; GCN-NEXT: s_add_i32 s22, s23, s22 +; GCN-NEXT: s_mul_i32 s24, s24, s27 +; GCN-NEXT: s_add_i32 s22, s22, s24 +; GCN-NEXT: s_mul_i32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 +; GCN-NEXT: s_mul_i32 s26, s25, s17 +; GCN-NEXT: s_mul_i32 s29, s27, s22 +; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 ; GCN-NEXT: s_add_u32 s17, s17, s29 ; GCN-NEXT: s_addc_u32 s28, 0, s28 -; GCN-NEXT: s_add_u32 s17, s17, s27 -; GCN-NEXT: s_mul_hi_u32 s22, s23, s25 -; GCN-NEXT: s_addc_u32 s17, s28, s26 -; GCN-NEXT: s_addc_u32 s22, s22, 0 -; GCN-NEXT: s_mul_i32 s25, s23, s25 -; GCN-NEXT: s_add_u32 s17, s17, s25 -; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s17, s24, s17 -; GCN-NEXT: s_addc_u32 s26, s23, s22 +; GCN-NEXT: s_add_u32 s17, s17, s26 +; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 +; GCN-NEXT: s_addc_u32 s17, s28, s24 +; GCN-NEXT: s_addc_u32 s23, s23, 0 +; GCN-NEXT: s_mul_i32 s22, s25, s22 +; GCN-NEXT: s_add_u32 s17, s17, s22 +; GCN-NEXT: s_addc_u32 s24, 0, s23 +; GCN-NEXT: s_add_u32 s17, s27, s17 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s26, s25, s24 ; GCN-NEXT: s_ashr_i32 s22, s19, 31 ; GCN-NEXT: s_add_u32 s24, s18, s22 ; GCN-NEXT: s_mov_b32 s23, s22 @@ -4985,9 +5021,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s17, s6, s17 ; GCN-NEXT: s_sub_u32 s17, s24, s17 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s24, s28, s7 ; GCN-NEXT: s_sub_u32 s30, s17, s6 ; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s31, s24, 0 ; GCN-NEXT: s_cmp_ge_u32 s31, s7 ; GCN-NEXT: s_cselect_b32 s33, -1, 0 @@ -4997,10 +5035,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s33, s34, s33 ; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, s7 -; GCN-NEXT: s_sub_u32 s28, s30, s6 +; GCN-NEXT: s_sub_u32 s34, s30, s6 +; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 ; GCN-NEXT: s_subb_u32 s24, s24, 0 ; GCN-NEXT: s_cmp_lg_u32 s33, 0 -; GCN-NEXT: s_cselect_b32 s28, s28, s30 +; GCN-NEXT: s_cselect_b32 s28, s34, s30 ; GCN-NEXT: s_cselect_b32 s24, s24, s31 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s19, s25, s19 @@ -5051,7 +5091,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 ; GCN-NEXT: s_sub_u32 s13, 0, s18 -; GCN-NEXT: s_subb_u32 s20, 0, s19 +; GCN-NEXT: s_subb_u32 s22, 0, s19 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5060,52 +5100,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s21, v1 -; GCN-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s25, s13, s22 -; GCN-NEXT: s_mul_i32 s24, s20, s22 -; GCN-NEXT: s_add_i32 s23, s25, s23 -; GCN-NEXT: s_add_i32 s23, s23, s24 -; GCN-NEXT: s_mul_i32 s26, s13, s22 -; GCN-NEXT: s_mul_i32 s25, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s27, s22, s26 -; GCN-NEXT: s_mul_hi_u32 s24, s22, s23 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s20, v0 +; GCN-NEXT: s_mul_i32 s21, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 +; GCN-NEXT: s_mul_i32 s24, s22, s20 +; GCN-NEXT: s_add_i32 s21, s25, s21 +; GCN-NEXT: s_add_i32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s20 +; GCN-NEXT: s_mul_i32 s25, s20, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 ; GCN-NEXT: s_add_u32 s25, s27, s25 ; GCN-NEXT: s_addc_u32 s24, 0, s24 -; GCN-NEXT: s_mul_hi_u32 s28, s21, s26 -; GCN-NEXT: s_mul_i32 s26, s21, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 +; GCN-NEXT: s_mul_i32 s26, s23, s26 ; GCN-NEXT: s_add_u32 s25, s25, s26 -; GCN-NEXT: s_mul_hi_u32 s27, s21, s23 +; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 ; GCN-NEXT: s_addc_u32 s24, s24, s28 ; GCN-NEXT: s_addc_u32 s25, s27, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s23, s24, s23 +; GCN-NEXT: s_mul_i32 s21, s23, s21 +; GCN-NEXT: s_add_u32 s21, s24, s21 ; GCN-NEXT: s_addc_u32 s24, 0, s25 -; GCN-NEXT: s_add_u32 s22, s22, s23 -; GCN-NEXT: s_addc_u32 s21, s21, s24 -; GCN-NEXT: s_mul_i32 s23, s13, s21 -; GCN-NEXT: s_mul_hi_u32 s24, s13, s22 -; GCN-NEXT: s_add_i32 s23, s24, s23 -; GCN-NEXT: s_mul_i32 s20, s20, s22 -; GCN-NEXT: s_add_i32 s23, s23, s20 -; GCN-NEXT: s_mul_i32 s13, s13, s22 -; GCN-NEXT: s_mul_hi_u32 s24, s21, s13 -; GCN-NEXT: s_mul_i32 s25, s21, s13 -; GCN-NEXT: s_mul_i32 s27, s22, s23 -; GCN-NEXT: s_mul_hi_u32 s13, s22, s13 -; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: s_add_u32 s25, s20, s21 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s20, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 +; GCN-NEXT: s_add_i32 s20, s21, s20 +; GCN-NEXT: s_mul_i32 s22, s22, s25 +; GCN-NEXT: s_add_i32 s20, s20, s22 +; GCN-NEXT: s_mul_i32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 +; GCN-NEXT: s_mul_i32 s24, s23, s13 +; GCN-NEXT: s_mul_i32 s27, s25, s20 +; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 ; GCN-NEXT: s_add_u32 s13, s13, s27 ; GCN-NEXT: s_addc_u32 s26, 0, s26 -; GCN-NEXT: s_add_u32 s13, s13, s25 -; GCN-NEXT: s_mul_hi_u32 s20, s21, s23 -; GCN-NEXT: s_addc_u32 s13, s26, s24 -; GCN-NEXT: s_addc_u32 s20, s20, 0 -; GCN-NEXT: s_mul_i32 s23, s21, s23 -; GCN-NEXT: s_add_u32 s13, s13, s23 -; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_add_u32 s13, s22, s13 -; GCN-NEXT: s_addc_u32 s24, s21, s20 +; GCN-NEXT: s_add_u32 s13, s13, s24 +; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 +; GCN-NEXT: s_addc_u32 s13, s26, s22 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_mul_i32 s20, s23, s20 +; GCN-NEXT: s_add_u32 s13, s13, s20 +; GCN-NEXT: s_addc_u32 s22, 0, s21 +; GCN-NEXT: s_add_u32 s13, s25, s13 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s24, s23, s22 ; GCN-NEXT: s_ashr_i32 s20, s15, 31 ; GCN-NEXT: s_add_u32 s22, s14, s20 ; GCN-NEXT: s_mov_b32 s21, s20 @@ -5134,9 +5178,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s13, s18, s13 ; GCN-NEXT: s_sub_u32 s13, s22, s13 ; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s22, s26, s19 ; GCN-NEXT: s_sub_u32 s28, s13, s18 ; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s29, s22, 0 ; GCN-NEXT: s_cmp_ge_u32 s29, s19 ; GCN-NEXT: s_cselect_b32 s30, -1, 0 @@ -5146,10 +5192,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s30, s31, s30 ; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, s19 -; GCN-NEXT: s_sub_u32 s26, s28, s18 +; GCN-NEXT: s_sub_u32 s31, s28, s18 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 ; GCN-NEXT: s_subb_u32 s22, s22, 0 ; GCN-NEXT: s_cmp_lg_u32 s30, 0 -; GCN-NEXT: s_cselect_b32 s26, s26, s28 +; GCN-NEXT: s_cselect_b32 s26, s31, s28 ; GCN-NEXT: s_cselect_b32 s22, s22, s29 ; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 ; GCN-NEXT: s_subb_u32 s15, s23, s15 @@ -5209,7 +5257,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GCN-NEXT: s_sub_u32 s9, 0, s14 -; GCN-NEXT: s_subb_u32 s16, 0, s15 +; GCN-NEXT: s_subb_u32 s18, 0, s15 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5218,52 +5266,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_readfirstlane_b32 s18, v0 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s21, s9, s18 -; GCN-NEXT: s_mul_i32 s20, s16, s18 -; GCN-NEXT: s_add_i32 s19, s21, s19 -; GCN-NEXT: s_add_i32 s19, s19, s20 -; GCN-NEXT: s_mul_i32 s22, s9, s18 -; GCN-NEXT: s_mul_i32 s21, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s23, s18, s22 -; GCN-NEXT: s_mul_hi_u32 s20, s18, s19 +; GCN-NEXT: v_readfirstlane_b32 s19, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 +; GCN-NEXT: s_mul_i32 s20, s18, s16 +; GCN-NEXT: s_add_i32 s17, s21, s17 +; GCN-NEXT: s_add_i32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s16 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 ; GCN-NEXT: s_add_u32 s21, s23, s21 ; GCN-NEXT: s_addc_u32 s20, 0, s20 -; GCN-NEXT: s_mul_hi_u32 s24, s17, s22 -; GCN-NEXT: s_mul_i32 s22, s17, s22 +; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 +; GCN-NEXT: s_mul_i32 s22, s19, s22 ; GCN-NEXT: s_add_u32 s21, s21, s22 -; GCN-NEXT: s_mul_hi_u32 s23, s17, s19 +; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 ; GCN-NEXT: s_addc_u32 s20, s20, s24 ; GCN-NEXT: s_addc_u32 s21, s23, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s19, s20, s19 +; GCN-NEXT: s_mul_i32 s17, s19, s17 +; GCN-NEXT: s_add_u32 s17, s20, s17 ; GCN-NEXT: s_addc_u32 s20, 0, s21 -; GCN-NEXT: s_add_u32 s18, s18, s19 -; GCN-NEXT: s_addc_u32 s17, s17, s20 -; GCN-NEXT: s_mul_i32 s19, s9, s17 -; GCN-NEXT: s_mul_hi_u32 s20, s9, s18 -; GCN-NEXT: s_add_i32 s19, s20, s19 -; GCN-NEXT: s_mul_i32 s16, s16, s18 -; GCN-NEXT: s_add_i32 s19, s19, s16 -; GCN-NEXT: s_mul_i32 s9, s9, s18 -; GCN-NEXT: s_mul_hi_u32 s20, s17, s9 -; GCN-NEXT: s_mul_i32 s21, s17, s9 -; GCN-NEXT: s_mul_i32 s23, s18, s19 -; GCN-NEXT: s_mul_hi_u32 s9, s18, s9 -; GCN-NEXT: s_mul_hi_u32 s22, s18, s19 +; GCN-NEXT: s_add_u32 s21, s16, s17 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s16, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 +; GCN-NEXT: s_add_i32 s16, s17, s16 +; GCN-NEXT: s_mul_i32 s18, s18, s21 +; GCN-NEXT: s_add_i32 s16, s16, s18 +; GCN-NEXT: s_mul_i32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 +; GCN-NEXT: s_mul_i32 s20, s19, s9 +; GCN-NEXT: s_mul_i32 s23, s21, s16 +; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 ; GCN-NEXT: s_add_u32 s9, s9, s23 ; GCN-NEXT: s_addc_u32 s22, 0, s22 -; GCN-NEXT: s_add_u32 s9, s9, s21 -; GCN-NEXT: s_mul_hi_u32 s16, s17, s19 -; GCN-NEXT: s_addc_u32 s9, s22, s20 -; GCN-NEXT: s_addc_u32 s16, s16, 0 -; GCN-NEXT: s_mul_i32 s19, s17, s19 -; GCN-NEXT: s_add_u32 s9, s9, s19 -; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_add_u32 s9, s18, s9 -; GCN-NEXT: s_addc_u32 s20, s17, s16 +; GCN-NEXT: s_add_u32 s9, s9, s20 +; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 +; GCN-NEXT: s_addc_u32 s9, s22, s18 +; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mul_i32 s16, s19, s16 +; GCN-NEXT: s_add_u32 s9, s9, s16 +; GCN-NEXT: s_addc_u32 s18, 0, s17 +; GCN-NEXT: s_add_u32 s9, s21, s9 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s20, s19, s18 ; GCN-NEXT: s_ashr_i32 s16, s11, 31 ; GCN-NEXT: s_add_u32 s18, s10, s16 ; GCN-NEXT: s_mov_b32 s17, s16 @@ -5292,9 +5344,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s9, s14, s9 ; GCN-NEXT: s_sub_u32 s9, s18, s9 ; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s18, s22, s15 ; GCN-NEXT: s_sub_u32 s24, s9, s14 ; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s25, s18, 0 ; GCN-NEXT: s_cmp_ge_u32 s25, s15 ; GCN-NEXT: s_cselect_b32 s26, -1, 0 @@ -5304,10 +5358,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s26, s27, s26 ; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, s15 -; GCN-NEXT: s_sub_u32 s22, s24, s14 +; GCN-NEXT: s_sub_u32 s27, s24, s14 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 ; GCN-NEXT: s_subb_u32 s18, s18, 0 ; GCN-NEXT: s_cmp_lg_u32 s26, 0 -; GCN-NEXT: s_cselect_b32 s22, s22, s24 +; GCN-NEXT: s_cselect_b32 s22, s27, s24 ; GCN-NEXT: s_cselect_b32 s18, s18, s25 ; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 ; GCN-NEXT: s_subb_u32 s11, s19, s11 @@ -5364,7 +5420,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s3, 0, s10 -; GCN-NEXT: s_subb_u32 s12, 0, s11 +; GCN-NEXT: s_subb_u32 s14, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -5373,52 +5429,56 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s13, v1 -; GCN-NEXT: v_readfirstlane_b32 s14, v0 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s17, s3, s14 -; GCN-NEXT: s_mul_i32 s16, s12, s14 -; GCN-NEXT: s_add_i32 s15, s17, s15 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: s_mul_i32 s18, s3, s14 -; GCN-NEXT: s_mul_i32 s17, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s19, s14, s18 -; GCN-NEXT: s_mul_hi_u32 s16, s14, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 ; GCN-NEXT: s_add_u32 s17, s19, s17 ; GCN-NEXT: s_addc_u32 s16, 0, s16 -; GCN-NEXT: s_mul_hi_u32 s20, s13, s18 -; GCN-NEXT: s_mul_i32 s18, s13, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 ; GCN-NEXT: s_add_u32 s17, s17, s18 -; GCN-NEXT: s_mul_hi_u32 s19, s13, s15 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 ; GCN-NEXT: s_addc_u32 s16, s16, s20 ; GCN-NEXT: s_addc_u32 s17, s19, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 ; GCN-NEXT: s_addc_u32 s16, 0, s17 -; GCN-NEXT: s_add_u32 s14, s14, s15 -; GCN-NEXT: s_addc_u32 s13, s13, s16 -; GCN-NEXT: s_mul_i32 s15, s3, s13 -; GCN-NEXT: s_mul_hi_u32 s16, s3, s14 -; GCN-NEXT: s_add_i32 s15, s16, s15 -; GCN-NEXT: s_mul_i32 s12, s12, s14 -; GCN-NEXT: s_add_i32 s15, s15, s12 -; GCN-NEXT: s_mul_i32 s3, s3, s14 -; GCN-NEXT: s_mul_hi_u32 s16, s13, s3 -; GCN-NEXT: s_mul_i32 s17, s13, s3 -; GCN-NEXT: s_mul_i32 s19, s14, s15 -; GCN-NEXT: s_mul_hi_u32 s3, s14, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 ; GCN-NEXT: s_add_u32 s3, s3, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_add_u32 s3, s3, s17 -; GCN-NEXT: s_mul_hi_u32 s12, s13, s15 -; GCN-NEXT: s_addc_u32 s3, s18, s16 -; GCN-NEXT: s_addc_u32 s12, s12, 0 -; GCN-NEXT: s_mul_i32 s15, s13, s15 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s12, 0, s12 -; GCN-NEXT: s_add_u32 s3, s14, s3 -; GCN-NEXT: s_addc_u32 s16, s13, s12 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 ; GCN-NEXT: s_ashr_i32 s12, s5, 31 ; GCN-NEXT: s_add_u32 s14, s4, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5447,9 +5507,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s3, s10, s3 ; GCN-NEXT: s_sub_u32 s3, s14, s3 ; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s14, s18, s11 ; GCN-NEXT: s_sub_u32 s20, s3, s10 ; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s21, s14, 0 ; GCN-NEXT: s_cmp_ge_u32 s21, s11 ; GCN-NEXT: s_cselect_b32 s22, -1, 0 @@ -5459,10 +5521,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s22, s23, s22 ; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, s11 -; GCN-NEXT: s_sub_u32 s18, s20, s10 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 ; GCN-NEXT: s_subb_u32 s14, s14, 0 ; GCN-NEXT: s_cmp_lg_u32 s22, 0 -; GCN-NEXT: s_cselect_b32 s18, s18, s20 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 ; GCN-NEXT: s_cselect_b32 s14, s14, s21 ; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 ; GCN-NEXT: s_subb_u32 s5, s15, s5 @@ -6235,9 +6299,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s14, v8 ; TONGA-NEXT: s_sub_u32 s12, s12, s14 ; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 ; TONGA-NEXT: s_sub_u32 s18, s12, s6 ; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s19, s1, 0 ; TONGA-NEXT: s_cmp_ge_u32 s19, s7 ; TONGA-NEXT: s_cselect_b32 s20, -1, 0 @@ -6247,10 +6313,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_cselect_b32 s20, s21, s20 ; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, s7 -; TONGA-NEXT: s_sub_u32 s16, s18, s6 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 ; TONGA-NEXT: s_subb_u32 s1, s1, 0 ; TONGA-NEXT: s_cmp_lg_u32 s20, 0 -; TONGA-NEXT: s_cselect_b32 s16, s16, s18 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 ; TONGA-NEXT: s_cselect_b32 s1, s1, s19 ; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 ; TONGA-NEXT: s_subb_u32 s3, s13, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ea9bb04..33b0a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s12, s5, s10 -; GCN-NEXT: s_sub_i32 s10, s7, s12 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 -; GCN-NEXT: s_subb_u32 s13, s10, s9 -; GCN-NEXT: s_sub_u32 s14, s6, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s8 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, s17, s16 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 -; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s12 -; GCN-NEXT: s_cmp_ge_u32 s4, s9 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s9 -; GCN-NEXT: s_cselect_b32 s5, s7, s5 -; GCN-NEXT: s_cmp_lg_u32 s5, 0 -; GCN-NEXT: s_cselect_b32 s4, s10, s4 -; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1009,9 +1016,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s8, s9 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s8, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s9, v0 @@ -1042,6 +1050,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_add_u32 s11, s14, s8 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: s_addc_u32 s10, s12, s10 ; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_add_u32 s6, s6, s8 @@ -1074,43 +1083,46 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-NEXT: s_add_i32 s11, s12, s11 ; GCN-NEXT: s_mul_i32 s12, s5, s10 -; GCN-NEXT: s_add_i32 s14, s11, s12 -; GCN-NEXT: s_sub_i32 s12, s7, s14 +; GCN-NEXT: s_add_i32 s12, s11, s12 +; GCN-NEXT: s_sub_i32 s13, s7, s12 ; GCN-NEXT: s_mul_i32 s10, s4, s10 ; GCN-NEXT: s_sub_u32 s6, s6, s10 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s13, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s6, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s17, s12, s13 -; GCN-NEXT: s_subb_u32 s17, s15, 0 -; GCN-NEXT: s_cmp_ge_u32 s17, s5 -; GCN-NEXT: s_cselect_b32 s18, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s16, s4 -; GCN-NEXT: s_cselect_b32 s19, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s17, s5 -; GCN-NEXT: s_cselect_b32 s18, s19, s18 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s15, s15, s5 -; GCN-NEXT: s_sub_u32 s19, s16, s4 -; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GCN-NEXT: s_or_b32 s12, s12, s13 -; GCN-NEXT: s_subb_u32 s12, s15, 0 -; GCN-NEXT: s_cmp_lg_u32 s18, 0 -; GCN-NEXT: s_cselect_b32 s13, s19, s16 -; GCN-NEXT: s_cselect_b32 s12, s12, s17 +; GCN-NEXT: s_or_b32 s14, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s15, s6, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s16, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s4 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s17, s17, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s18, s15, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s7, s7, s14 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s11, s18, s15 +; GCN-NEXT: s_cselect_b32 s10, s10, s16 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s12 ; GCN-NEXT: s_cmp_ge_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s7, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s10 +; GCN-NEXT: s_cselect_b32 s4, s4, s12 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s5, s12, s7 -; GCN-NEXT: s_cselect_b32 s4, s13, s6 +; GCN-NEXT: s_cselect_b32 s5, s10, s7 +; GCN-NEXT: s_cselect_b32 s4, s11, s6 ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: s_sub_u32 s4, s4, s8 ; GCN-NEXT: s_subb_u32 s5, s5, s8 @@ -1158,6 +1170,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s10, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0 ; GCN-IR-NEXT: s_addc_u32 s10, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 @@ -1191,6 +1204,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s18, s18, 1 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_or_b32 s20, s20, s21 +; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0 ; GCN-IR-NEXT: s_addc_u32 s19, s19, 0 ; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -1355,9 +1369,10 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s6, s7 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s6, s2, s9 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 @@ -1388,6 +1403,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s2, s11, s2 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_addc_u32 s6, s9, s8 ; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 @@ -1402,42 +1418,45 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mul_i32 s7, s5, s6 ; GCN-NEXT: s_mul_i32 s6, s4, s6 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: s_add_i32 s10, s8, s7 -; GCN-NEXT: s_sub_i32 s8, 0, s10 -; GCN-NEXT: s_sub_u32 s11, 24, s6 +; GCN-NEXT: s_add_i32 s8, s8, s7 +; GCN-NEXT: s_sub_i32 s9, 0, s8 +; GCN-NEXT: s_sub_u32 s10, 24, s6 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s11, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s12, s10, s4 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b32 s9, s6, s7 -; GCN-NEXT: s_subb_u32 s12, s8, s5 -; GCN-NEXT: s_sub_u32 s13, s11, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_subb_u32 s14, s12, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s5 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s4 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s5 -; GCN-NEXT: s_cselect_b32 s15, s16, s15 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s5 -; GCN-NEXT: s_sub_u32 s16, s13, s4 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 -; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_subb_u32 s6, 0, s10 -; GCN-NEXT: s_cmp_ge_u32 s6, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s13, s9, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s5 ; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s4 +; GCN-NEXT: s_cmp_ge_u32 s12, s4 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s5 +; GCN-NEXT: s_cselect_b32 s14, s14, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s15, s12, s4 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s6, s9, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s7, s15, s12 +; GCN-NEXT: s_cselect_b32 s6, s6, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s8 +; GCN-NEXT: s_cmp_ge_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s4 ; GCN-NEXT: s_cselect_b32 s4, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s6, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cmp_eq_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s9 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s8, s6 -; GCN-NEXT: s_cselect_b32 s5, s9, s11 +; GCN-NEXT: s_cselect_b32 s4, s6, s8 +; GCN-NEXT: s_cselect_b32 s5, s7, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1470,6 +1489,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s8, s2, 1 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_or_b32 s9, s10, s11 +; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 @@ -1502,6 +1522,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index bdd22f25..bb5918b2 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_addc_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -34,9 +35,11 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: s_addc_u32 s3, s3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 @@ -50,12 +53,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s4, s2, s6 -; GFX9-NEXT: s_addc_u32 s5, s3, s7 +; GFX9-NEXT: s_add_u32 s6, s2, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s4, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -68,6 +73,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_addc_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -84,12 +91,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -435,6 +444,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_add_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -455,14 +465,16 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_addc_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -474,10 +486,12 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s12, s14 -; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_addc_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -490,8 +504,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 -; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -504,8 +520,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac..41199b0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -148,6 +148,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -181,6 +182,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5] @@ -829,9 +831,10 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -862,6 +865,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -870,50 +874,52 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_readfirstlane_b32 s8, v1 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_add_u32 s4, s8, s4 -; GCN-NEXT: s_addc_u32 s10, 0, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mul_i32 s0, s3, s10 +; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s11, s1, s0 -; GCN-NEXT: s_sub_i32 s8, 0, s11 -; GCN-NEXT: s_mul_i32 s0, s2, s10 -; GCN-NEXT: s_sub_u32 s12, 24, s0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s12, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s9, s0, s1 -; GCN-NEXT: s_subb_u32 s13, s8, s3 -; GCN-NEXT: s_sub_u32 s14, s12, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s9, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s2 -; GCN-NEXT: s_cselect_b32 s13, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, s3 -; GCN-NEXT: s_cselect_b32 s8, s13, s9 -; GCN-NEXT: s_add_u32 s9, s10, 1 -; GCN-NEXT: s_addc_u32 s13, 0, 0 -; GCN-NEXT: s_add_u32 s14, s10, 2 -; GCN-NEXT: s_addc_u32 s15, 0, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s8, s14, s9 -; GCN-NEXT: s_cselect_b32 s9, s15, s13 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s11 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 ; GCN-NEXT: s_cmp_ge_u32 s0, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s12, s2 -; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s0, s2, s1 +; GCN-NEXT: s_cselect_b32 s0, s10, s1 +; GCN-NEXT: s_add_u32 s1, s8, 1 +; GCN-NEXT: s_addc_u32 s10, 0, 0 +; GCN-NEXT: s_add_u32 s13, s8, 2 +; GCN-NEXT: s_addc_u32 s14, 0, 0 ; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, s9, 0 -; GCN-NEXT: s_cselect_b32 s1, s8, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_cselect_b32 s0, s13, s1 +; GCN-NEXT: s_cselect_b32 s1, s14, s10 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -939,6 +945,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -971,6 +978,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1309,6 +1317,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1338,6 +1347,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_or_b32 s12, s12, s13 +; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1f..cdcc914 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,9 +51,10 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_addc_u32 s13, 0, s14 ; GCN-NEXT: s_add_u32 s14, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s12, s12, s13 ; GCN-NEXT: s_mul_i32 s0, s10, s12 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -84,6 +85,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_add_u32 s11, s14, s0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_addc_u32 s1, s12, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 @@ -113,43 +115,46 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: v_readfirstlane_b32 s10, v0 ; GCN-NEXT: s_add_i32 s5, s10, s5 ; GCN-NEXT: s_mul_i32 s10, s9, s4 -; GCN-NEXT: s_add_i32 s12, s5, s10 -; GCN-NEXT: s_sub_i32 s10, s7, s12 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 ; GCN-NEXT: s_mul_i32 s4, s8, s4 ; GCN-NEXT: s_sub_u32 s6, s6, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_or_b32 s11, s4, s5 -; GCN-NEXT: s_subb_u32 s13, s10, s9 -; GCN-NEXT: s_sub_u32 s14, s6, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s15, s10, s11 -; GCN-NEXT: s_subb_u32 s15, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s8 -; GCN-NEXT: s_cselect_b32 s17, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s15, s9 -; GCN-NEXT: s_cselect_b32 s16, s17, s16 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s13, s13, s9 -; GCN-NEXT: s_sub_u32 s17, s14, s8 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_or_b32 s10, s10, s11 -; GCN-NEXT: s_subb_u32 s10, s13, 0 -; GCN-NEXT: s_cmp_lg_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s11, s17, s14 -; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_subb_u32 s4, s7, s12 -; GCN-NEXT: s_cmp_ge_u32 s4, s9 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 ; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 ; GCN-NEXT: s_cmp_ge_u32 s6, s8 -; GCN-NEXT: s_cselect_b32 s7, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, s9 -; GCN-NEXT: s_cselect_b32 s5, s7, s5 -; GCN-NEXT: s_cmp_lg_u32 s5, 0 -; GCN-NEXT: s_cselect_b32 s4, s10, s4 -; GCN-NEXT: s_cselect_b32 s5, s11, s6 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -182,6 +187,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_or_b32 s8, s8, s9 +; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-IR-NEXT: s_addc_u32 s8, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -215,6 +221,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_add_u32 s16, s16, 1 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_or_b32 s18, s18, s19 +; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0 ; GCN-IR-NEXT: s_addc_u32 s17, s17, 0 ; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -846,9 +853,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_addc_u32 s10, 0, s11 ; GCN-NEXT: s_add_u32 s11, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s9, s9, s10 ; GCN-NEXT: s_mul_i32 s4, s6, s9 ; GCN-NEXT: v_readfirstlane_b32 s5, v0 @@ -879,6 +887,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_add_u32 s8, s11, s4 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_addc_u32 s4, s9, s6 ; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 ; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 @@ -894,43 +903,46 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mul_i32 s0, s3, s8 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_add_i32 s10, s1, s0 -; GCN-NEXT: s_sub_i32 s9, 0, s10 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 ; GCN-NEXT: s_mul_i32 s0, s2, s8 -; GCN-NEXT: s_sub_u32 s11, 24, s0 +; GCN-NEXT: s_sub_u32 s8, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s11, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s8, s2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_or_b32 s8, s0, s1 -; GCN-NEXT: s_subb_u32 s12, s9, s3 -; GCN-NEXT: s_sub_u32 s13, s11, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s14, s8, s9 -; GCN-NEXT: s_subb_u32 s14, s12, 0 -; GCN-NEXT: s_cmp_ge_u32 s14, s3 -; GCN-NEXT: s_cselect_b32 s15, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s13, s2 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s14, s3 -; GCN-NEXT: s_cselect_b32 s15, s16, s15 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s12, s12, s3 -; GCN-NEXT: s_sub_u32 s16, s13, s2 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_or_b32 s8, s8, s9 -; GCN-NEXT: s_subb_u32 s8, s12, 0 -; GCN-NEXT: s_cmp_lg_u32 s15, 0 -; GCN-NEXT: s_cselect_b32 s9, s16, s13 -; GCN-NEXT: s_cselect_b32 s8, s8, s14 ; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_subb_u32 s0, 0, s10 -; GCN-NEXT: s_cmp_ge_u32 s0, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s13, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s3 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: s_cmp_ge_u32 s11, s2 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s3 +; GCN-NEXT: s_cselect_b32 s14, s14, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s15, s12, s2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s1, s15, s12 +; GCN-NEXT: s_cselect_b32 s0, s0, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s2 ; GCN-NEXT: s_cselect_b32 s2, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s8, s0 -; GCN-NEXT: s_cselect_b32 s1, s9, s11 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s9 +; GCN-NEXT: s_cselect_b32 s1, s1, s8 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -958,6 +970,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -990,6 +1003,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s14, s14, 1 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_or_b32 s16, s16, s17 +; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0 ; GCN-IR-NEXT: s_addc_u32 s15, s15, 0 ; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1079,6 +1093,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s11, s8, 1 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_or_b32 s6, s6, s7 +; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-IR-NEXT: s_addc_u32 s6, s9, 0 ; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -1108,6 +1123,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_or_b32 s14, s14, s15 +; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index e8db647..d67a7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_subb_u32 s3, s3, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -34,9 +35,11 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s2, s2, s4 -; VI-NEXT: s_subb_u32 s3, s3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s3, s3, s5 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 @@ -50,12 +53,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s4, s2, s6 -; GFX9-NEXT: s_subb_u32 s5, s3, s7 +; GFX9-NEXT: s_sub_u32 s6, s2, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_subb_u32 s4, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -68,6 +73,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_subb_u32 s3, s3, s7 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 @@ -84,12 +91,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -434,6 +443,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_sub_u32 s4, s4, s6 ; SI-NEXT: s_cselect_b64 s[12:13], -1, 0 ; SI-NEXT: s_or_b32 s6, s12, s13 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_subb_u32 s5, s5, s7 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -454,14 +464,16 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_sub_u32 s2, s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_sub_u32 s0, s4, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_subb_u32 s1, s5, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_subb_u32 s0, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -473,10 +485,12 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s12, s14 -; GFX9-NEXT: s_subb_u32 s1, s13, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_sub_u32 s2, s12, s14 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_subb_u32 s0, s13, s15 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -489,8 +503,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_u32 s0, s12, s14 -; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 ; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 @@ -503,8 +519,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s4, s4, s6 -; GFX11-NEXT: s_subb_u32 s5, s5, s7 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_subb_u32 s5, s5, s7 ; GFX11-NEXT: s_cselect_b32 s4, -1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 28c6b40..75db387 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -774,40 +774,44 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 ; GFX1032-NEXT: s_add_u32 s8, s8, s11 +; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1032-NEXT: s_mul_i32 s11, s9, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1032-NEXT: s_mul_i32 s12, s9, s8 -; GFX1032-NEXT: s_mul_i32 s9, s9, s5 ; GFX1032-NEXT: s_mul_i32 s10, s10, s8 -; GFX1032-NEXT: s_add_i32 s9, s11, s9 -; GFX1032-NEXT: s_mul_i32 s11, s5, s12 +; GFX1032-NEXT: s_mul_i32 s9, s9, s5 +; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1032-NEXT: s_add_i32 s9, s13, s9 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1032-NEXT: s_mul_i32 s10, s5, s11 ; GFX1032-NEXT: s_mul_i32 s15, s8, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 -; GFX1032-NEXT: s_add_u32 s10, s10, s15 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1032-NEXT: s_add_u32 s12, s12, s15 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 -; GFX1032-NEXT: s_add_u32 s10, s10, s11 +; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s12, s10 ; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 -; GFX1032-NEXT: s_addc_u32 s11, s12, 0 +; GFX1032-NEXT: s_addc_u32 s11, s11, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 ; GFX1032-NEXT: s_add_u32 s8, s8, s9 +; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 ; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 ; GFX1032-NEXT: s_mul_i32 s8, s3, s8 -; GFX1032-NEXT: s_add_u32 s9, s9, s12 -; GFX1032-NEXT: s_addc_u32 s11, 0, s11 +; GFX1032-NEXT: s_mul_i32 s12, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1032-NEXT: s_add_u32 s11, s11, s12 +; GFX1032-NEXT: s_addc_u32 s10, 0, s10 ; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s9, s8 +; GFX1032-NEXT: s_add_u32 s8, s11, s8 ; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s11, s10 +; GFX1032-NEXT: s_addc_u32 s8, s10, s9 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 ; GFX1032-NEXT: s_add_u32 s5, s8, s5 ; GFX1032-NEXT: s_addc_u32 s8, 0, s9 @@ -820,8 +824,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, s1 ; GFX1032-NEXT: s_sub_u32 s13, s10, s0 +; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1032-NEXT: s_subb_u32 s11, s11, 0 ; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 @@ -894,8 +901,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX1064-NEXT: s_sub_u32 s8, 0, s0 -; GFX1064-NEXT: s_subb_u32 s9, 0, s1 +; GFX1064-NEXT: s_sub_u32 s9, 0, s0 +; GFX1064-NEXT: s_subb_u32 s10, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -904,102 +911,109 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s5, v0 -; GFX1064-NEXT: s_mul_i32 s10, s8, s4 -; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5 -; GFX1064-NEXT: s_mul_i32 s11, s9, s5 -; GFX1064-NEXT: s_add_i32 s10, s12, s10 -; GFX1064-NEXT: s_mul_i32 s13, s8, s5 -; GFX1064-NEXT: s_add_i32 s10, s10, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13 -; GFX1064-NEXT: s_mul_i32 s15, s5, s10 -; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13 -; GFX1064-NEXT: s_mul_i32 s11, s4, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10 +; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: s_mul_i32 s5, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 +; GFX1064-NEXT: s_mul_i32 s11, s10, s4 +; GFX1064-NEXT: s_add_i32 s5, s12, s5 +; GFX1064-NEXT: s_mul_i32 s13, s9, s4 +; GFX1064-NEXT: s_add_i32 s5, s5, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX1064-NEXT: s_mul_i32 s15, s4, s5 +; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1064-NEXT: s_mul_i32 s11, s8, s13 +; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 ; GFX1064-NEXT: s_add_u32 s12, s12, s15 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10 +; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 ; GFX1064-NEXT: s_add_u32 s11, s12, s11 -; GFX1064-NEXT: s_mul_i32 s10, s4, s10 +; GFX1064-NEXT: s_mul_i32 s5, s8, s5 ; GFX1064-NEXT: s_addc_u32 s11, s13, s14 ; GFX1064-NEXT: s_addc_u32 s12, s16, 0 -; GFX1064-NEXT: s_add_u32 s10, s11, s10 +; GFX1064-NEXT: s_add_u32 s5, s11, s5 ; GFX1064-NEXT: s_addc_u32 s11, 0, s12 -; GFX1064-NEXT: s_add_u32 s5, s5, s10 -; GFX1064-NEXT: s_addc_u32 s4, s4, s11 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5 -; GFX1064-NEXT: s_mul_i32 s11, s8, s5 -; GFX1064-NEXT: s_mul_i32 s8, s8, s4 -; GFX1064-NEXT: s_mul_i32 s9, s9, s5 -; GFX1064-NEXT: s_add_i32 s8, s10, s8 -; GFX1064-NEXT: s_mul_i32 s10, s4, s11 -; GFX1064-NEXT: s_add_i32 s8, s8, s9 -; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11 -; GFX1064-NEXT: s_mul_i32 s14, s5, s8 -; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8 -; GFX1064-NEXT: s_add_u32 s9, s9, s14 -; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11 +; GFX1064-NEXT: s_add_u32 s12, s4, s5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_i32 s4, s9, s12 +; GFX1064-NEXT: s_addc_u32 s8, s8, s11 +; GFX1064-NEXT: s_mul_i32 s10, s10, s12 +; GFX1064-NEXT: s_mul_i32 s9, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX1064-NEXT: s_add_i32 s9, s13, s9 +; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX1064-NEXT: s_add_i32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s4, s8, s4 +; GFX1064-NEXT: s_mul_i32 s14, s12, s9 +; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1064-NEXT: s_add_u32 s5, s5, s14 ; GFX1064-NEXT: s_addc_u32 s13, 0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8 -; GFX1064-NEXT: s_add_u32 s9, s9, s10 -; GFX1064-NEXT: s_mul_i32 s8, s4, s8 -; GFX1064-NEXT: s_addc_u32 s9, s13, s12 -; GFX1064-NEXT: s_addc_u32 s10, s11, 0 -; GFX1064-NEXT: s_add_u32 s8, s9, s8 -; GFX1064-NEXT: s_addc_u32 s9, 0, s10 -; GFX1064-NEXT: s_add_u32 s5, s5, s8 -; GFX1064-NEXT: s_addc_u32 s4, s4, s9 -; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5 -; GFX1064-NEXT: s_mul_i32 s11, s2, s4 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4 -; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1064-NEXT: s_add_u32 s4, s5, s4 +; GFX1064-NEXT: s_mul_i32 s9, s8, s9 +; GFX1064-NEXT: s_addc_u32 s4, s13, s11 +; GFX1064-NEXT: s_addc_u32 s5, s10, 0 +; GFX1064-NEXT: s_add_u32 s4, s4, s9 +; GFX1064-NEXT: s_addc_u32 s9, 0, s5 +; GFX1064-NEXT: s_add_u32 s10, s12, s4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 +; GFX1064-NEXT: s_addc_u32 s5, s8, s9 +; GFX1064-NEXT: s_mul_i32 s8, s3, s10 +; GFX1064-NEXT: s_mul_i32 s10, s2, s5 +; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 +; GFX1064-NEXT: s_addc_u32 s9, 0, s9 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 +; GFX1064-NEXT: s_add_u32 s8, s10, s8 ; GFX1064-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064-NEXT: s_add_u32 s8, s8, s11 -; GFX1064-NEXT: s_addc_u32 s10, 0, s10 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4 -; GFX1064-NEXT: s_add_u32 s5, s8, s5 -; GFX1064-NEXT: s_mul_i32 s4, s3, s4 -; GFX1064-NEXT: s_addc_u32 s5, s10, s9 +; GFX1064-NEXT: s_addc_u32 s4, s9, s4 ; GFX1064-NEXT: s_addc_u32 s8, s12, 0 -; GFX1064-NEXT: s_add_u32 s10, s5, s4 +; GFX1064-NEXT: s_add_u32 s10, s4, s5 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 ; GFX1064-NEXT: s_mul_i32 s5, s0, s11 ; GFX1064-NEXT: s_mul_i32 s8, s1, s10 ; GFX1064-NEXT: s_add_i32 s4, s4, s5 -; GFX1064-NEXT: s_add_i32 s8, s4, s8 +; GFX1064-NEXT: s_add_i32 s12, s4, s8 ; GFX1064-NEXT: s_mul_i32 s4, s0, s10 -; GFX1064-NEXT: s_sub_i32 s9, s3, s8 -; GFX1064-NEXT: s_sub_u32 s12, s2, s4 +; GFX1064-NEXT: s_sub_i32 s8, s3, s12 +; GFX1064-NEXT: s_sub_u32 s13, s2, s4 ; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX1064-NEXT: s_subb_u32 s9, s9, s1 -; GFX1064-NEXT: s_sub_u32 s13, s12, s0 -; GFX1064-NEXT: s_subb_u32 s9, s9, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s9, s1 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_subb_u32 s14, s8, s1 +; GFX1064-NEXT: s_sub_u32 s15, s13, s0 +; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1064-NEXT: s_subb_u32 s8, s14, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 +; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 ; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 -; GFX1064-NEXT: s_cselect_b32 s13, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s9, s1 -; GFX1064-NEXT: s_cselect_b32 s9, s13, s14 -; GFX1064-NEXT: s_add_u32 s13, s10, 1 +; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 +; GFX1064-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1064-NEXT: s_add_u32 s9, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_add_u32 s15, s10, 2 ; GFX1064-NEXT: s_addc_u32 s16, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u32 s9, 0 -; GFX1064-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 ; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 ; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1064-NEXT: s_subb_u32 s3, s3, s8 +; GFX1064-NEXT: s_subb_u32 s3, s3, s12 ; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s12, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 ; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 ; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 ; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 -; GFX1064-NEXT: s_cselect_b32 s4, s13, s10 +; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 4445383..64d055b 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -271,6 +271,7 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 +; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -280,6 +281,7 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 +; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -297,6 +299,8 @@ define i1 @workgroup_nonzero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 +; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe +; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -307,6 +311,7 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 +; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -316,6 +321,7 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 +; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31] @@ -333,6 +339,8 @@ define i1 @workgroup_nonzero() { ; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_wait_alu 0xfffe +; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0 ; GISEL-GFX12-NEXT: s_wait_alu 0xfffe ; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/Transforms/SCCP/constant-range-struct.ll b/llvm/test/Transforms/SCCP/constant-range-struct.ll index 7a399df..0f45b38 100644 --- a/llvm/test/Transforms/SCCP/constant-range-struct.ll +++ b/llvm/test/Transforms/SCCP/constant-range-struct.ll @@ -25,7 +25,7 @@ true: br label %exit false: - %s.3 = insertvalue {i64, i64} undef, i64 30, 0 + %s.3 = insertvalue {i64, i64} poison, i64 30, 0 %s.4 = insertvalue {i64, i64} %s.3, i64 300, 1 br label %exit @@ -39,14 +39,14 @@ define void @struct1_caller() { ; CHECK-NEXT: [[S:%.*]] = call { i64, i64 } @struct1() ; CHECK-NEXT: [[V1:%.*]] = extractvalue { i64, i64 } [[S]], 0 ; CHECK-NEXT: [[V2:%.*]] = extractvalue { i64, i64 } [[S]], 1 -; CHECK-NEXT: [[T_1:%.*]] = icmp ne i64 [[V1]], 10 -; CHECK-NEXT: call void @use(i1 [[T_1]]) -; CHECK-NEXT: [[T_2:%.*]] = icmp ult i64 [[V1]], 100 -; CHECK-NEXT: call void @use(i1 [[T_2]]) -; CHECK-NEXT: [[T_3:%.*]] = icmp ne i64 [[V2]], 0 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_3:%.*]] = icmp eq i64 [[V1]], 20 ; CHECK-NEXT: call void @use(i1 [[T_3]]) -; CHECK-NEXT: [[T_4:%.*]] = icmp ult i64 [[V2]], 301 -; CHECK-NEXT: call void @use(i1 [[T_4]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_6:%.*]] = icmp eq i64 [[V2]], 300 +; CHECK-NEXT: call void @use(i1 [[T_6]]) ; CHECK-NEXT: ret void ; %s = call {i64, i64} @struct1() @@ -57,10 +57,14 @@ define void @struct1_caller() { call void @use(i1 %t.1) %t.2 = icmp ult i64 %v1, 100 call void @use(i1 %t.2) - %t.3 = icmp ne i64 %v2, 0 + %t.3 = icmp eq i64 %v1, 20 call void @use(i1 %t.3) - %t.4 = icmp ult i64 %v2, 301 + %t.4 = icmp ne i64 %v2, 0 call void @use(i1 %t.4) + %t.5 = icmp ult i64 %v2, 301 + call void @use(i1 %t.5) + %t.6 = icmp eq i64 %v2, 300 + call void @use(i1 %t.6) ret void } @@ -76,7 +80,7 @@ define internal {i64, i64} @struct2() { ; CHECK: exit: ; CHECK-NEXT: [[V1:%.*]] = phi i64 [ 20, [[TRUE]] ], [ 30, [[FALSE]] ] ; CHECK-NEXT: [[V2:%.*]] = phi i64 [ 200, [[TRUE]] ], [ 300, [[FALSE]] ] -; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } undef, i64 [[V1]], 0 +; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } poison, i64 [[V1]], 0 ; CHECK-NEXT: [[S_2:%.*]] = insertvalue { i64, i64 } [[S_1]], i64 [[V2]], 1 ; CHECK-NEXT: ret { i64, i64 } [[S_2]] ; @@ -92,7 +96,7 @@ false: exit: %v1 = phi i64 [ 20, %true ], [ 30, %false ] %v2 = phi i64 [ 200, %true ], [ 300, %false ] - %s.1 = insertvalue {i64, i64} undef, i64 %v1, 0 + %s.1 = insertvalue {i64, i64} poison, i64 %v1, 0 %s.2 = insertvalue {i64, i64} %s.1, i64 %v2, 1 ret {i64, i64} %s.2 } @@ -153,3 +157,40 @@ define void @struct2_caller() { ret void } + +%"phi_type" = type {i64, i64} + +define internal %"phi_type" @test(i32 %input) { +; CHECK-LABEL: @test( +; CHECK-NEXT: br label [[COND_TRUE_I:%.*]] +; CHECK: cond.true.i: +; CHECK-NEXT: br label [[COND_END_I:%.*]] +; CHECK: cond.end.i: +; CHECK-NEXT: ret [[PHI_TYPE:%.*]] poison +; + %cmp.cond = icmp eq i32 %input, 1 + br i1 %cmp.cond, label %cond.true.i, label %cond.false.i + +cond.true.i: + %r1.tmp = insertvalue %"phi_type" poison, i64 1, 0 + %r1.tmp.2 = insertvalue %"phi_type" %r1.tmp, i64 2, 1 + br label %cond.end.i + +cond.false.i: + %r2.tmp = insertvalue %"phi_type" poison, i64 3, 0 + %r2.tmp.2 = insertvalue %"phi_type" %r2.tmp, i64 4, 1 + br label %cond.end.i + +cond.end.i: + %retval = phi %"phi_type" [ %r1.tmp.2, %cond.true.i ], [ %r2.tmp.2, %cond.false.i ] + ret %"phi_type" %retval +} + +define %"phi_type" @test2() { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[CALL_1:%.*]] = tail call fastcc [[PHI_TYPE:%.*]] @[[TEST:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 noundef 1) +; CHECK-NEXT: ret [[PHI_TYPE]] { i64 1, i64 2 } +; + %call.1 = tail call fastcc noundef %"phi_type" @test(i32 noundef 1) + ret %"phi_type" %call.1 +} diff --git a/llvm/unittests/ADT/BitTest.cpp b/llvm/unittests/ADT/BitTest.cpp index eaed4e1..5b3df91 100644 --- a/llvm/unittests/ADT/BitTest.cpp +++ b/llvm/unittests/ADT/BitTest.cpp @@ -270,6 +270,22 @@ TEST(BitTest, BitWidthConstexpr) { llvm::bit_width_constexpr(std::numeric_limits<uint64_t>::max()) == 64); } +TEST(BitTest, BitCeilConstexpr) { + static_assert(llvm::bit_ceil_constexpr(0u) == 1); + static_assert(llvm::bit_ceil_constexpr(1u) == 1); + static_assert(llvm::bit_ceil_constexpr(2u) == 2); + static_assert(llvm::bit_ceil_constexpr(3u) == 4); + static_assert(llvm::bit_ceil_constexpr(4u) == 4); + static_assert(llvm::bit_ceil_constexpr(5u) == 8); + static_assert(llvm::bit_ceil_constexpr(6u) == 8); + static_assert(llvm::bit_ceil_constexpr(7u) == 8); + static_assert(llvm::bit_ceil_constexpr(8u) == 8); + + static_assert(llvm::bit_ceil_constexpr(255u) == 256); + static_assert(llvm::bit_ceil_constexpr(256u) == 256); + static_assert(llvm::bit_ceil_constexpr(257u) == 512); +} + TEST(BitTest, CountlZero) { uint8_t Z8 = 0; uint16_t Z16 = 0; diff --git a/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td b/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td index 3143ab7..99b22e5 100644 --- a/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td +++ b/mlir/include/mlir/Dialect/SMT/IR/SMTOps.td @@ -220,8 +220,6 @@ def YieldOp : SMTOp<"yield", [ Pure, Terminator, ReturnLike, - ParentOneOf<["smt::SolverOp", "smt::CheckOp", - "smt::ForallOp", "smt::ExistsOp"]>, ]> { let summary = "terminator operation for various regions of SMT operations"; let arguments = (ins Variadic<AnyType>:$values); diff --git a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h index fc69b03..f6353a9 100644 --- a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h +++ b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h @@ -10,6 +10,7 @@ #define MLIR_DIALECT_TRANSFORM_SMTEXTENSION_SMTEXTENSIONOPS_H #include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Dialect/SMT/IR/SMTOps.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/OpDefinition.h" diff --git a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td index b987cb3..9d9783a 100644 --- a/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td +++ b/mlir/include/mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.td @@ -16,7 +16,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [ DeclareOpInterfaceMethods<TransformOpInterface>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, - NoTerminator + SingleBlockImplicitTerminator<"::mlir::smt::YieldOp"> ]> { let cppNamespace = [{ mlir::transform::smt }]; @@ -24,14 +24,20 @@ def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [ let description = [{ Allows expressing constraints on params using the SMT dialect. - Each Transform dialect param provided as an operand has a corresponding + Each Transform-dialect param provided as an operand has a corresponding argument of SMT-type in the region. The SMT-Dialect ops in the region use - these arguments as operands. + these params-as-SMT-vars as operands, thereby expressing relevant + constraints on their allowed values. + + Computations w.r.t. passed-in params can also be expressed through the + region's SMT-ops. Namely, the constraints express relationships to other + SMT-variables which can then be yielded from the region (with `smt.yield`). The semantics of this op is that all the ops in the region together express a constraint on the params-interpreted-as-smt-vars. The op fails in case the expressed constraint is not satisfiable per SMTLIB semantics. Otherwise the - op succeeds. + op succeeds and any one satisfying assignment is used to map the + SMT-variables yielded in the region to `transform.param`s. --- @@ -42,9 +48,10 @@ def ConstrainParamsOp : Op<Transform_Dialect, "smt.constrain_params", [ }]; let arguments = (ins Variadic<TransformParamTypeInterface>:$params); + let results = (outs Variadic<TransformParamTypeInterface>:$results); let regions = (region SizedRegion<1>:$body); let assemblyFormat = - "`(` $params `)` attr-dict `:` type(operands) $body"; + "`(` $params `)` attr-dict `:` functional-type(operands, results) $body"; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp b/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp index 8e7af05..abc1316 100644 --- a/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/SMTExtension/SMTExtensionOps.cpp @@ -8,8 +8,8 @@ #include "mlir/Dialect/Transform/SMTExtension/SMTExtensionOps.h" #include "mlir/Dialect/SMT/IR/SMTDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/Dialect/Transform/SMTExtension/SMTExtension.h" +#include "mlir/Dialect/SMT/IR/SMTOps.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" using namespace mlir; @@ -23,6 +23,7 @@ using namespace mlir; void transform::smt::ConstrainParamsOp::getEffects( SmallVectorImpl<MemoryEffects::EffectInstance> &effects) { onlyReadsHandle(getParamsMutable(), effects); + producesHandle(getResults(), effects); } DiagnosedSilenceableFailure @@ -37,19 +38,95 @@ transform::smt::ConstrainParamsOp::apply(transform::TransformRewriter &rewriter, // and allow for users to attach their own implementation, which would, // e.g., translate the ops to SMTLIB and hand that over to the user's // favourite solver. This requires changes to the dialect's verifier. - return emitDefiniteFailure() << "op does not have interpreted semantics yet"; + return emitSilenceableFailure(getLoc()) + << "op does not have interpreted semantics yet"; } LogicalResult transform::smt::ConstrainParamsOp::verify() { + auto yieldTerminator = + dyn_cast<mlir::smt::YieldOp>(getRegion().front().back()); + if (!yieldTerminator) + return emitOpError() << "expected '" + << mlir::smt::YieldOp::getOperationName() + << "' as terminator"; + + auto checkTypes = [](size_t idx, Type smtType, StringRef smtDesc, + Type paramType, StringRef paramDesc, + auto *atOp) -> InFlightDiagnostic { + if (!isa<mlir::smt::BoolType, mlir::smt::IntType, mlir::smt::BitVectorType>( + smtType)) + return atOp->emitOpError() << "the type of " << smtDesc << " #" << idx + << " is expected to be either a !smt.bool, a " + "!smt.int, or a !smt.bv"; + + assert(isa<TransformParamTypeInterface>(paramType) && + "ODS specifies params' type should implement param interface"); + if (isa<transform::AnyParamType>(paramType)) + return {}; // No further checks can be done. + + // NB: This cast must succeed as long as the only implementors of + // TransformParamTypeInterface are AnyParamType and ParamType. + Type typeWrappedByParam = cast<ParamType>(paramType).getType(); + + if (isa<mlir::smt::IntType>(smtType)) { + if (!isa<IntegerType>(typeWrappedByParam)) + return atOp->emitOpError() + << "the type of " << smtDesc << " #" << idx + << " is !smt.int though the corresponding " << paramDesc + << " type (" << paramType << ") is not wrapping an integer type"; + } else if (isa<mlir::smt::BoolType>(smtType)) { + auto wrappedIntType = dyn_cast<IntegerType>(typeWrappedByParam); + if (!wrappedIntType || wrappedIntType.getWidth() != 1) + return atOp->emitOpError() + << "the type of " << smtDesc << " #" << idx + << " is !smt.bool though the corresponding " << paramDesc + << " type (" << paramType << ") is not wrapping i1"; + } else if (auto bvSmtType = dyn_cast<mlir::smt::BitVectorType>(smtType)) { + auto wrappedIntType = dyn_cast<IntegerType>(typeWrappedByParam); + if (!wrappedIntType || wrappedIntType.getWidth() != bvSmtType.getWidth()) + return atOp->emitOpError() + << "the type of " << smtDesc << " #" << idx << " is " << smtType + << " though the corresponding " << paramDesc << " type (" + << paramType + << ") is not wrapping an integer type of the same bitwidth"; + } + + return {}; + }; + if (getOperands().size() != getBody().getNumArguments()) return emitOpError( "must have the same number of block arguments as operands"); + for (auto [idx, operandType, blockArgType] : + llvm::enumerate(getOperandTypes(), getBody().getArgumentTypes())) { + InFlightDiagnostic typeCheckResult = + checkTypes(idx, blockArgType, "block arg", operandType, "operand", + /*atOp=*/this); + if (LogicalResult(typeCheckResult).failed()) + return typeCheckResult; + } + for (auto &op : getBody().getOps()) { if (!isa<mlir::smt::SMTDialect>(op.getDialect())) return emitOpError( "ops contained in region should belong to SMT-dialect"); } + if (yieldTerminator->getNumOperands() != getNumResults()) + return yieldTerminator.emitOpError() + << "expected terminator to have as many operands as the parent op " + "has results"; + + for (auto [idx, termOperandType, resultType] : llvm::enumerate( + yieldTerminator->getOperands().getType(), getResultTypes())) { + InFlightDiagnostic typeCheckResult = + checkTypes(idx, termOperandType, "terminator operand", + cast<transform::ParamType>(resultType), "result", + /*atOp=*/&yieldTerminator); + if (LogicalResult(typeCheckResult).failed()) + return typeCheckResult; + } + return success(); } diff --git a/mlir/python/mlir/dialects/transform/smt.py b/mlir/python/mlir/dialects/transform/smt.py index 1f0b7f0..af88fff 100644 --- a/mlir/python/mlir/dialects/transform/smt.py +++ b/mlir/python/mlir/dialects/transform/smt.py @@ -19,6 +19,7 @@ except ImportError as e: class ConstrainParamsOp(ConstrainParamsOp): def __init__( self, + results: Sequence[Type], params: Sequence[transform.AnyParamType], arg_types: Sequence[Type], loc=None, @@ -27,6 +28,7 @@ class ConstrainParamsOp(ConstrainParamsOp): if len(params) != len(arg_types): raise ValueError(f"{params=} not same length as {arg_types=}") super().__init__( + results, params, loc=loc, ip=ip, @@ -36,3 +38,13 @@ class ConstrainParamsOp(ConstrainParamsOp): @property def body(self) -> Block: return self.regions[0].blocks[0] + + +def constrain_params( + results: Sequence[Type], + params: Sequence[transform.AnyParamType], + arg_types: Sequence[Type], + loc=None, + ip=None, +): + return ConstrainParamsOp(results, params, arg_types, loc=loc, ip=ip) diff --git a/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir b/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir index 314b8d4..d91d69a 100644 --- a/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir +++ b/mlir/test/Dialect/Transform/test-smt-extension-invalid.mlir @@ -1,11 +1,40 @@ // RUN: mlir-opt %s --transform-interpreter --split-input-file --verify-diagnostics +// CHECK-LABEL: @incorrect terminator +module attributes {transform.with_named_sequence} { + transform.named_sequence @operands_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i64> + // expected-error@below {{op expected 'smt.yield' as terminator}} + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () { + ^bb0(%param_as_smt_var: !smt.int): + transform.yield + } + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @operands_not_one_to_one_with_vars +module attributes {transform.with_named_sequence} { + transform.named_sequence @operands_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i64> + // expected-error@below {{must have the same number of block arguments as operands}} + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () { + ^bb0(%param_as_smt_var: !smt.int, %param_as_another_smt_var: !smt.int): + } + transform.yield + } +} + +// ----- + // CHECK-LABEL: @constraint_not_using_smt_ops module attributes {transform.with_named_sequence} { transform.named_sequence @constraint_not_using_smt_ops(%arg0: !transform.any_op {transform.readonly}) { %param_as_param = transform.param.constant 42 -> !transform.param<i64> // expected-error@below {{ops contained in region should belong to SMT-dialect}} - transform.smt.constrain_params(%param_as_param) : !transform.param<i64> { + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () { ^bb0(%param_as_smt_var: !smt.int): %c4 = arith.constant 4 : i32 // This is the kind of thing one might think works: @@ -17,13 +46,90 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: @operands_not_one_to_one_with_vars +// CHECK-LABEL: @results_not_one_to_one_with_vars module attributes {transform.with_named_sequence} { - transform.named_sequence @operands_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) { + transform.named_sequence @results_not_one_to_one_with_vars(%arg0: !transform.any_op {transform.readonly}) { %param_as_param = transform.param.constant 42 -> !transform.param<i64> - // expected-error@below {{must have the same number of block arguments as operands}} - transform.smt.constrain_params(%param_as_param) : !transform.param<i64> { + transform.smt.constrain_params(%param_as_param, %param_as_param) : (!transform.param<i64>, !transform.param<i64>) -> () { ^bb0(%param_as_smt_var: !smt.int, %param_as_another_smt_var: !smt.int): + // expected-error@below {{expected terminator to have as many operands as the parent op has results}} + smt.yield %param_as_smt_var : !smt.int + } + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @non_smt_type_block_args +module attributes {transform.with_named_sequence} { + transform.named_sequence @non_smt_type_block_args(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i8> + // expected-error@below {{the type of block arg #0 is expected to be either a !smt.bool, a !smt.int, or a !smt.bv}} + transform.smt.constrain_params(%param_as_param) : (!transform.param<i8>) -> (!transform.param<i8>) { + ^bb0(%param_as_smt_var: !transform.param<i8>): + smt.yield %param_as_smt_var : !transform.param<i8> + } + transform.yield + } +} + + +// ----- + +// CHECK-LABEL: @mismatched_arg_type_bool +module attributes {transform.with_named_sequence} { + transform.named_sequence @mismatched_arg_type_bool(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i64> + // expected-error@below {{the type of block arg #0 is !smt.bool though the corresponding operand type ('!transform.param<i64>') is not wrapping i1}} + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> (!transform.param<i64>) { + ^bb0(%param_as_smt_var: !smt.bool): + smt.yield %param_as_smt_var : !smt.bool + } + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @mismatched_arg_type_bitvector +module attributes {transform.with_named_sequence} { + transform.named_sequence @mismatched_arg_type_bitvector(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i64> + // expected-error@below {{the type of block arg #0 is '!smt.bv<8>' though the corresponding operand type ('!transform.param<i64>') is not wrapping an integer type of the same bitwidth}} + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> (!transform.param<i64>) { + ^bb0(%param_as_smt_var: !smt.bv<8>): + smt.yield %param_as_smt_var : !smt.bv<8> + } + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @mismatched_result_type_bool +module attributes {transform.with_named_sequence} { + transform.named_sequence @mismatched_result_type_bool(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 1 -> !transform.param<i1> + transform.smt.constrain_params(%param_as_param) : (!transform.param<i1>) -> (!transform.param<i64>) { + ^bb0(%param_as_smt_var: !smt.bool): + // expected-error@below {{the type of terminator operand #0 is !smt.bool though the corresponding result type ('!transform.param<i64>') is not wrapping i1}} + smt.yield %param_as_smt_var : !smt.bool + } + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @mismatched_result_type_bitvector +module attributes {transform.with_named_sequence} { + transform.named_sequence @mismatched_result_type_bitvector(%arg0: !transform.any_op {transform.readonly}) { + %param_as_param = transform.param.constant 42 -> !transform.param<i8> + transform.smt.constrain_params(%param_as_param) : (!transform.param<i8>) -> (!transform.param<i64>) { + ^bb0(%param_as_smt_var: !smt.bv<8>): + // expected-error@below {{the type of terminator operand #0 is '!smt.bv<8>' though the corresponding result type ('!transform.param<i64>') is not wrapping an integer type of the same bitwidth}} + smt.yield %param_as_smt_var : !smt.bv<8> } transform.yield } diff --git a/mlir/test/Dialect/Transform/test-smt-extension.mlir b/mlir/test/Dialect/Transform/test-smt-extension.mlir index 29d1517..6cc41dd 100644 --- a/mlir/test/Dialect/Transform/test-smt-extension.mlir +++ b/mlir/test/Dialect/Transform/test-smt-extension.mlir @@ -7,7 +7,7 @@ module attributes {transform.with_named_sequence} { %param_as_param = transform.param.constant 42 -> !transform.param<i64> // CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]]) - transform.smt.constrain_params(%param_as_param) : !transform.param<i64> { + transform.smt.constrain_params(%param_as_param) : (!transform.param<i64>) -> () { // CHECK: ^bb{{.*}}(%[[PARAM_AS_SMT_SYMB:.*]]: !smt.int): ^bb0(%param_as_smt_var: !smt.int): // CHECK: %[[C0:.*]] = smt.int.constant 0 @@ -31,18 +31,20 @@ module attributes {transform.with_named_sequence} { // ----- -// CHECK-LABEL: @schedule_with_constraint_on_multiple_params +// CHECK-LABEL: @schedule_with_constraint_on_multiple_params_returning_computed_value module attributes {transform.with_named_sequence} { - transform.named_sequence @schedule_with_constraint_on_multiple_params(%arg0: !transform.any_op {transform.readonly}) { + transform.named_sequence @schedule_with_constraint_on_multiple_params_returning_computed_value(%arg0: !transform.any_op {transform.readonly}) { // CHECK: %[[PARAM_A:.*]] = transform.param.constant %param_a = transform.param.constant 4 -> !transform.param<i64> // CHECK: %[[PARAM_B:.*]] = transform.param.constant - %param_b = transform.param.constant 16 -> !transform.param<i64> + %param_b = transform.param.constant 32 -> !transform.param<i64> // CHECK: transform.smt.constrain_params(%[[PARAM_A]], %[[PARAM_B]]) - transform.smt.constrain_params(%param_a, %param_b) : !transform.param<i64>, !transform.param<i64> { + %divisor = transform.smt.constrain_params(%param_a, %param_b) : (!transform.param<i64>, !transform.param<i64>) -> (!transform.param<i64>) { // CHECK: ^bb{{.*}}(%[[VAR_A:.*]]: !smt.int, %[[VAR_B:.*]]: !smt.int): ^bb0(%var_a: !smt.int, %var_b: !smt.int): + // CHECK: %[[DIV:.*]] = smt.int.div %[[VAR_B]], %[[VAR_A]] + %divisor = smt.int.div %var_b, %var_a // CHECK: %[[C0:.*]] = smt.int.constant 0 %c0 = smt.int.constant 0 // CHECK: %[[REMAINDER:.*]] = smt.int.mod %[[VAR_B]], %[[VAR_A]] @@ -51,8 +53,11 @@ module attributes {transform.with_named_sequence} { %eq = smt.eq %remainder, %c0 : !smt.int // CHECK: smt.assert %[[EQ]] smt.assert %eq + // CHECK: smt.yield %[[DIV]] + smt.yield %divisor : !smt.int } - // NB: from here can rely on that %param_a is a divisor of %param_b + // NB: from here can rely on that %param_a is a divisor of %param_b and + // that the relevant factor, 8, got associated to %divisor. transform.yield } } @@ -63,10 +68,10 @@ module attributes {transform.with_named_sequence} { module attributes {transform.with_named_sequence} { transform.named_sequence @schedule_with_param_as_a_bool(%arg0: !transform.any_op {transform.readonly}) { // CHECK: %[[PARAM_AS_PARAM:.*]] = transform.param.constant - %param_as_param = transform.param.constant true -> !transform.any_param + %param_as_param = transform.param.constant true -> !transform.param<i1> // CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]]) - transform.smt.constrain_params(%param_as_param) : !transform.any_param { + transform.smt.constrain_params(%param_as_param) : (!transform.param<i1>) -> () { // CHECK: ^bb{{.*}}(%[[PARAM_AS_SMT_VAR:.*]]: !smt.bool): ^bb0(%param_as_smt_var: !smt.bool): // CHECK: %[[C0:.*]] = smt.int.constant 0 diff --git a/mlir/test/python/dialects/transform_smt_ext.py b/mlir/test/python/dialects/transform_smt_ext.py index 3692fd9..e28c56f 100644 --- a/mlir/test/python/dialects/transform_smt_ext.py +++ b/mlir/test/python/dialects/transform_smt_ext.py @@ -25,26 +25,44 @@ def run(f): # CHECK-LABEL: TEST: testConstrainParamsOp @run def testConstrainParamsOp(target): - dummy_value = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42) + c42_attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42) # CHECK: %[[PARAM_AS_PARAM:.*]] = transform.param.constant - symbolic_value = transform.ParamConstantOp( - transform.AnyParamType.get(), dummy_value + symbolic_value_as_param = transform.ParamConstantOp( + transform.AnyParamType.get(), c42_attr ) # CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]]) constrain_params = transform_smt.ConstrainParamsOp( - [symbolic_value], [smt.IntType.get()] + [], [symbolic_value_as_param], [smt.IntType.get()] ) # CHECK-NEXT: ^bb{{.*}}(%[[PARAM_AS_SMT_SYMB:.*]]: !smt.int): with ir.InsertionPoint(constrain_params.body): + symbolic_value_as_smt_var = constrain_params.body.arguments[0] # CHECK: %[[C0:.*]] = smt.int.constant 0 c0 = smt.IntConstantOp(ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0)) # CHECK: %[[C43:.*]] = smt.int.constant 43 c43 = smt.IntConstantOp(ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 43)) # CHECK: %[[LB:.*]] = smt.int.cmp le %[[C0]], %[[PARAM_AS_SMT_SYMB]] - lb = smt.IntCmpOp(smt.IntPredicate.le, c0, constrain_params.body.arguments[0]) + lb = smt.IntCmpOp(smt.IntPredicate.le, c0, symbolic_value_as_smt_var) # CHECK: %[[UB:.*]] = smt.int.cmp le %[[PARAM_AS_SMT_SYMB]], %[[C43]] - ub = smt.IntCmpOp(smt.IntPredicate.le, constrain_params.body.arguments[0], c43) + ub = smt.IntCmpOp(smt.IntPredicate.le, symbolic_value_as_smt_var, c43) # CHECK: %[[BOUNDED:.*]] = smt.and %[[LB]], %[[UB]] bounded = smt.AndOp([lb, ub]) # CHECK: smt.assert %[[BOUNDED:.*]] smt.AssertOp(bounded) + smt.YieldOp([]) + + # CHECK: transform.smt.constrain_params(%[[PARAM_AS_PARAM]]) + compute_with_params = transform_smt.ConstrainParamsOp( + [transform.ParamType.get(ir.IntegerType.get_signless(32))], + [symbolic_value_as_param], + [smt.IntType.get()], + ) + # CHECK-NEXT: ^bb{{.*}}(%[[SMT_SYMB:.*]]: !smt.int): + with ir.InsertionPoint(compute_with_params.body): + symbolic_value_as_smt_var = compute_with_params.body.arguments[0] + # CHECK: %[[TWICE:.*]] = smt.int.add %[[SMT_SYMB]], %[[SMT_SYMB]] + twice_symb = smt.IntAddOp( + [symbolic_value_as_smt_var, symbolic_value_as_smt_var] + ) + # CHECK: smt.yield %[[TWICE]] + smt.YieldOp([twice_symb]) |