author    Xiaofeng Tian <110771974+txff99@users.noreply.github.com>  2024-09-11 14:00:54 -0700
committer Vitaly Buka <vitalybuka@google.com>  2024-09-11 14:00:54 -0700
commit    8ee4ba92e1a85fd220caf2f83d5d86c334d432db (patch)
tree      847bf2e4f29d84e1369a9d4ef2d1e69c637a36de
parent    01006d0bc93e5c6a6953ea8874eadb56ec719399 (diff)
parent    c2b93e0671d8cfd6b1a24c6e1d7be290125b8974 (diff)
[spr] changes introduced through rebase
Branch: users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912
Created using spr 1.3.4
[skip ci]
288 files changed, 10031 insertions, 7117 deletions
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
index 8116db5..98c592d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
@@ -187,14 +187,14 @@ void lambda_value_reference_auxiliary_var(T&& t) {
 namespace deleted_functions {
 
 template <typename T>
-void f(T &&) = delete;
+void f(T &&t) = delete;
 
 struct S {
   template <typename T>
-  S(T &&) = delete;
+  S(T &&t) = delete;
 
   template <typename T>
-  void operator&(T &&) = delete;
+  void operator&(T &&t) = delete;
 };
 
 } // namespace deleted_functions
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 2f119fe..ec617ee 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -500,7 +500,7 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize,
 bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
   // For the soft-float ABI variant, no types are considered to be homogeneous
   // aggregates.
-  if (Kind == AArch64ABIKind::AAPCSSoft)
+  if (isSoftFloat())
     return false;
 
   // Homogeneous aggregates for AAPCS64 must have base types of a floating
@@ -555,8 +555,8 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
     BaseTy = ArrTy->getElementType();
     NumRegs = ArrTy->getNumElements();
   }
-  bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft &&
-               (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
+  bool IsFPR =
+      !isSoftFloat() && (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
 
   // The AArch64 va_list type and handling is specified in the Procedure Call
   // Standard, section B.4:
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 071e64f..da7bbf8 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -525,6 +525,16 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
         .addArraySubscriptOperators()
         .completeDefinition();
   });
+
+  Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "StructuredBuffer")
+             .addSimpleTemplateParams(*SemaPtr, {"element_type"})
+             .Record;
+  onCompletion(Decl, [this](CXXRecordDecl *Decl) {
+    setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
+                    ResourceKind::TypedBuffer, /*IsROV=*/false)
+        .addArraySubscriptOperators()
+        .completeDefinition();
+  });
 }
 
 void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record,
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 49bbff1..f48b2fd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -143,6 +143,16 @@ bool isReturnValueRefCounted(const clang::FunctionDecl *F) {
   return false;
 }
 
+std::optional<bool> isUncounted(const QualType T) {
+  if (auto *Subst = dyn_cast<SubstTemplateTypeParmType>(T)) {
+    if (auto *Decl = Subst->getAssociatedDecl()) {
+      if (isRefType(safeGetName(Decl)))
+        return false;
+    }
+  }
+  return isUncounted(T->getAsCXXRecordDecl());
+}
+
 std::optional<bool> isUncounted(const CXXRecordDecl* Class)
 {
   // Keep isRefCounted first as it's cheaper.
@@ -231,11 +241,9 @@ bool isSingleton(const FunctionDecl *F) {
     if (!MethodDecl->isStatic())
       return false;
   }
-  const auto &Name = safeGetName(F);
-  std::string SingletonStr = "singleton";
-  auto index = Name.find(SingletonStr);
-  return index != std::string::npos &&
-         index == Name.size() - SingletonStr.size();
+  const auto &NameStr = safeGetName(F);
+  StringRef Name = NameStr; // FIXME: Make safeGetName return StringRef.
+  return Name == "singleton" || Name.ends_with("Singleton");
 }
 
 // We only care about statements so let's use the simple
@@ -397,6 +405,7 @@ public:
       return true;
     if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" ||
+        Name == "WTFReportBacktrace" ||
         Name == "WTFCrashWithSecurityImplication" || Name == "WTFCrash" ||
         Name == "WTFReportAssertionFailure" ||
         Name == "isMainThread" || Name == "isMainThreadOrGCThread" ||
         Name == "isMainRunLoop" ||
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index ec1db1c..2932e62 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -20,6 +20,7 @@ class CXXMethodDecl;
 class CXXRecordDecl;
 class Decl;
 class FunctionDecl;
+class QualType;
 class Stmt;
 class Type;
 
@@ -44,6 +45,10 @@ bool isRefCounted(const clang::CXXRecordDecl *Class);
 
 /// \returns true if \p Class is ref-countable AND not ref-counted, false if
 /// not, std::nullopt if inconclusive.
+std::optional<bool> isUncounted(const clang::QualType T);
+
+/// \returns true if \p Class is ref-countable AND not ref-counted, false if
+/// not, std::nullopt if inconclusive.
 std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
 
 /// \returns true if \p T is either a raw pointer or reference to an uncounted
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 704c082..81c2434 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -87,8 +87,7 @@ public:
       }
       auto *E = MemberCallExpr->getImplicitObjectArgument();
       QualType ArgType = MemberCallExpr->getObjectType();
-      std::optional<bool> IsUncounted =
-          isUncounted(ArgType->getAsCXXRecordDecl());
+      std::optional<bool> IsUncounted = isUncounted(ArgType);
       if (IsUncounted && *IsUncounted && !isPtrOriginSafe(E))
         reportBugOnThis(E);
     }
diff --git a/clang/lib/Tooling/Transformer/Stencil.cpp b/clang/lib/Tooling/Transformer/Stencil.cpp
index bc4fa6e..223fb5a 100644
--- a/clang/lib/Tooling/Transformer/Stencil.cpp
+++ b/clang/lib/Tooling/Transformer/Stencil.cpp
@@ -50,7 +50,13 @@ static Error printNode(StringRef Id, const MatchFinder::MatchResult &Match,
   auto NodeOrErr = getNode(Match.Nodes, Id);
   if (auto Err = NodeOrErr.takeError())
     return Err;
-  NodeOrErr->print(Os, PrintingPolicy(Match.Context->getLangOpts()));
+  const PrintingPolicy PP(Match.Context->getLangOpts());
+  if (const auto *ND = NodeOrErr->get<NamedDecl>()) {
+    // For NamedDecls, we can do a better job than printing the whole thing.
+    ND->getNameForDiagnostic(Os, PP, false);
+  } else {
+    NodeOrErr->print(Os, PP);
+  }
   *Result += Output;
   return Error::success();
 }
diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
new file mode 100644
index 0000000..42991d8
--- /dev/null
+++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s
+
+
+// This test covers two different AST generations. The "EMPTY" test mode
+// verifies the AST generated by forward declaration of the HLSL types, which
+// happens on initializing the HLSL external AST with an AST Context.
+
+// The non-empty mode has a use that requires the StructuredBuffer type to be
+// complete, which results in the AST being populated by the external AST
+// source. That case covers the full implementation of the template
+// declaration and the instantiated specialization.
+
+// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class StructuredBuffer
+// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+
+// There should be no more occurrences of StructuredBuffer
+// EMPTY-NOT: StructuredBuffer
+
+#ifndef EMPTY
+
+StructuredBuffer<float> Buffer;
+
+#endif
+
+// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class StructuredBuffer definition
+
+// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h 'element_type *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
+
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'const StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &(unsigned int)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class StructuredBuffer definition
+
+// CHECK: TemplateArgument type 'float'
+// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
+// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit referenced h 'float *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
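The EMPTY/non-empty split above exercises the lazy, two-phase pattern visible in the HLSLExternalSemaSource change earlier in this patch: defineHLSLTypesWithForwardDeclarations() registers only a forward declaration plus an onCompletion callback, and the callback that builds the members runs the first time Sema needs the complete type. A minimal, self-contained C++ sketch of that pattern (RecordStub, LazyTypeRegistry, declare, and require are hypothetical names for illustration, not Clang API):

    #include <functional>
    #include <map>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for a record that is built lazily.
    struct RecordStub {
      std::string Name;
      bool Complete = false;
      std::vector<std::string> Members;
    };

    class LazyTypeRegistry {
      using Completer = std::function<void(RecordStub &)>;
      std::map<std::string, RecordStub> Stubs;
      std::map<std::string, Completer> Completions;

    public:
      // Phase 1: register a forward declaration and a completion callback,
      // mirroring defineHLSLTypesWithForwardDeclarations() + onCompletion().
      void declare(const std::string &Name, Completer C) {
        Stubs[Name] = RecordStub{Name};
        Completions[Name] = std::move(C);
      }

      // Phase 2: run the callback the first time a complete type is needed,
      // mirroring completeDefinition() inside the registered callback.
      RecordStub &require(const std::string &Name) {
        RecordStub &R = Stubs.at(Name);
        if (!R.Complete) {
          Completions.at(Name)(R);
          R.Complete = true;
        }
        return R;
      }
    };

    int main() {
      LazyTypeRegistry Registry;
      Registry.declare("StructuredBuffer", [](RecordStub &R) {
        R.Members = {"operator[] (const)", "operator[] (non-const)"};
      });
      // Declaring StructuredBuffer<float> Buffer; plays the role of require().
      return Registry.require("StructuredBuffer").Complete ? 0 : 1;
    }

In the EMPTY run nothing ever calls the equivalent of require(), so the dump stops at the forward declaration; the Buffer declaration in the other run forces completion, which is why the subscript operators and the float specialization appear.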
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index a98c6eb9c..97efb35 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -6,6 +6,7 @@
 void WTFBreakpointTrap();
 
 void WTFCrashWithInfo(int, const char*, const char*, int);
 void WTFReportAssertionFailure(const char* file, int line, const char* function, const char* assertion);
+void WTFReportBacktrace(void);
 void WTFCrash(void);
 void WTFCrashWithSecurityImplication(void);
@@ -334,6 +335,7 @@ public:
   }
   unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); }
   unsigned trivial61() { return DerivedNumber('7').value(); }
+  void trivial62() { WTFReportBacktrace(); }
 
   static RefCounted& singleton() {
     static RefCounted s_RefCounted;
@@ -341,6 +343,12 @@ public:
     return s_RefCounted;
   }
 
+  static RefCounted& otherSingleton() {
+    static RefCounted s_RefCounted;
+    s_RefCounted.ref();
+    return s_RefCounted;
+  }
+
   Number nonTrivial1() { return Number(3) + Number(4); }
   Number nonTrivial2() { return Number { 0.3 }; }
   int nonTrivial3() { return v ? otherFunction() : 0; }
@@ -506,9 +514,12 @@ public:
     getFieldTrivial().trivial59(); // no-warning
     getFieldTrivial().trivial60(); // no-warning
     getFieldTrivial().trivial61(); // no-warning
+    getFieldTrivial().trivial62(); // no-warning
 
     RefCounted::singleton().trivial18(); // no-warning
     RefCounted::singleton().someFunction(); // no-warning
+    RefCounted::otherSingleton().trivial18(); // no-warning
+    RefCounted::otherSingleton().someFunction(); // no-warning
 
     getFieldTrivial().recursiveTrivialFunction(7); // no-warning
     getFieldTrivial().recursiveComplexFunction(9);
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
new file mode 100644
index 0000000..db0c5b1
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
@@ -0,0 +1,26 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#import "mock-types.h"
+#import "mock-system-header.h"
+#import "../../Inputs/system-header-simulator-for-objc-dealloc.h"
+
+@interface Foo : NSObject
+
+@property (nonatomic, readonly) RefPtr<RefCountable> countable;
+
+- (void)execute;
+- (RefPtr<RefCountable>)_protectedRefCountable;
+@end
+
+@implementation Foo
+
+- (void)execute {
+  self._protectedRefCountable->method();
+}
+
+- (RefPtr<RefCountable>)_protectedRefCountable {
+  return _countable;
+}
+
+@end
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
new file mode 100644
index 0000000..16b7295
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+StructuredBuffer<float> Buffer1;
+StructuredBuffer<vector<float, 4> > BufferArray[4];
+
+StructuredBuffer<float> Buffer2 : register(u3);
+StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
+
+StructuredBuffer<float> Buffer3 : register(u3, space1);
+StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
+
+[numthreads(1,1,1)]
+void main() {
+}
+
+// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
+// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0}
+// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0}
+// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1}
+// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1}
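Each entry in the !hlsl.uavs list above has the same six operands, fixed by the CHECK-DAG lines. Read as a record, the layout is roughly the following sketch; the struct and field names are my own guesses for illustration, and only the operand order and values are actually pinned down by the test:

    // One !hlsl.uavs entry, as constrained by the CHECK-DAG lines above.
    struct HLSLUAVAnnotation {
      const void *Global;  // ptr to the buffer's global (mangled name)
      int Kind;            // i32 10 in every StructuredBuffer entry here
      int ElementType;     // i32 9 for float; varies with the element type
                           // (see the elementtype test later in this patch)
      bool IsROV;          // i1 false: not a rasterizer-ordered view
      int RegisterSlot;    // i32 3/4 from register(uN); i32 -1 when unbound
      int RegisterSpace;   // i32 0 by default; i32 1 for register(uN, space1)
    };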
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
new file mode 100644
index 0000000..34019e5
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
+
+StructuredBuffer<float> Buf;
+
+// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
+// CHECK-NEXT: entry:
+
+// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
+// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
new file mode 100644
index 0000000..8ddf8a6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+ BufI16[GI] = 0;
+ BufU16[GI] = 0;
+ BufI32[GI] = 0;
+ BufU32[GI] = 0;
+ BufI64[GI] = 0;
+ BufU64[GI] = 0;
+ BufF16[GI] = 0;
+ BufF32[GI] = 0;
+ BufF64[GI] = 0;
+ BufI16x4[GI] = 0;
+ BufU32x3[GI] = 0;
+ BufF16x2[GI] = 0;
+ BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
new file mode 100644
index 0000000..9bd885d
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+
+StructuredBuffer<int> In;
+StructuredBuffer<int> Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+ Out[GI] = In[GI];
+}
+
+// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
+// and confusing to follow, so the match here is fairly weak.
+
+// CHECK: define internal void @"?main@@YAXI@Z"
+// CHECK-NOT: call
+// CHECK: ret void
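The no-call match works because the synthesized operators carry AlwaysInlineAttr (visible in the AST test earlier in this patch), so even a -O0 build leaves no call to operator[] behind. In C++ terms, the two subscript operators the external sema source builds amount to the following sketch; it only mirrors the shape of the AST dump (the h handle field and the const/non-const pair), not the real generated code:

    // C++ analogue of the synthesized StructuredBuffer subscript operators.
    template <typename element_type>
    class StructuredBufferLike {
      element_type *h; // handle field, as in the implicit FieldDecl above

    public:
      // Matches the 'operator[] ... const' overload in the AST dump.
      const element_type &operator[](unsigned Idx) const { return h[Idx]; }
      // Matches the non-const overload; both are always-inline in the dump.
      element_type &operator[](unsigned Idx) { return h[Idx]; }
    };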
diff --git a/clang/test/CodeGenHLSL/loops/unroll.hlsl b/clang/test/CodeGenHLSL/loops/unroll.hlsl
index 7389f21..efca074 100644
--- a/clang/test/CodeGenHLSL/loops/unroll.hlsl
+++ b/clang/test/CodeGenHLSL/loops/unroll.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
 
 /*** for ***/
@@ -35,8 +35,8 @@ void for_nested_one_unroll_enable()
     for( int j = 0; j < 10; ++j)
       s += i + j;
   }
-// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE:.*]]
-// CHECK-NOT: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_1_ENABLE:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_INNER:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_OUTER:.*]]
 }
 
 void for_nested_two_unroll_enable()
@@ -111,20 +111,26 @@ void do_enable()
 }
 
 
-// CHECK: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], ![[FOR_COUNT:.*]]}
-// CHECK: ![[FOR_COUNT]] = !{!"llvm.loop.unroll.count", i32 8}
-// CHECK: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], ![[DISABLE:.*]]}
-// CHECK: ![[DISABLE]] = !{!"llvm.loop.unroll.disable"}
-// CHECK: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], ![[ENABLE:.*]]}
-// CHECK: ![[ENABLE]] = !{!"llvm.loop.unroll.enable"}
-// CHECK: ![[FOR_NESTED_ENABLE]] = distinct !{![[FOR_NESTED_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], ![[WHILE_COUNT:.*]]}
-// CHECK: ![[WHILE_COUNT]] = !{!"llvm.loop.unroll.count", i32 4}
-// CHECK: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], ![[DO_COUNT:.*]]}
-// CHECK: ![[DO_COUNT]] = !{!"llvm.loop.unroll.count", i32 16}
-// CHECK: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], ![[ENABLE]]}
+// CHECK-DAG: [[MUST_PROGRESS:.*]] = !{!"llvm.loop.mustprogress"}
+// CHECK-DAG: [[DISABLE:.*]] = !{!"llvm.loop.unroll.disable"}
+// CHECK-DAG: [[FOR_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 8}
+// CHECK-DAG: [[ENABLE:.*]] = !{!"llvm.loop.unroll.enable"}
+// CHECK-DAG: [[WHILE_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 4}
+// CHECK-DAG: [[DO_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 16}
+
+// CHECK-DAG: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], [[MUST_PROGRESS]], [[FOR_COUNT]]}
+// CHECK-DAG: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_INNER]] = distinct !{![[FOR_NESTED_ENABLE_INNER]], [[MUST_PROGRESS]]}
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_OUTER]] = distinct !{![[FOR_NESTED_ENABLE_OUTER]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], [[MUST_PROGRESS]], [[WHILE_COUNT]]}
+
+// CHECK-DAG: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], [[MUST_PROGRESS]], [[DO_COUNT]]}
+
+// CHECK-DAG: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
new file mode 100644
index 0000000..2450941
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s
+
+typedef vector<float, 3> float3;
+
+StructuredBuffer<float3> Buffer;
+
+// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer BufferErr1;
+
+// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer<> BufferErr2;
+
+[numthreads(1,1,1)]
+void main() {
+ (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3> >'}}
+ // expected-note@* {{implicitly declared private here}}
+}
diff --git a/clang/unittests/Tooling/StencilTest.cpp b/clang/unittests/Tooling/StencilTest.cpp
index 26257cf..445912a 100644
--- a/clang/unittests/Tooling/StencilTest.cpp
+++ b/clang/unittests/Tooling/StencilTest.cpp
@@ -565,6 +565,28 @@ TEST_F(StencilTest, DescribeAnonNamespaceType) {
                        HasValue(std::string(Expected)));
 }
 
+TEST_F(StencilTest, DescribeFunction) {
+  std::string Snippet = "int F(); F();";
+  std::string Expected = "F";
+  auto StmtMatch = matchStmt(Snippet, callExpr(callee(namedDecl().bind("fn"))));
+  ASSERT_TRUE(StmtMatch);
+  EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+                       HasValue(std::string(Expected)));
+}
+
+TEST_F(StencilTest, DescribeImplicitOperator) {
+  std::string Snippet = "struct Tag {}; [](Tag){};";
+  std::string Expected = "operator()";
+  auto StmtMatch = matchStmt(
+      Snippet,
+      stmt(hasDescendant(
+          cxxMethodDecl(hasParameter(0, hasType(namedDecl(hasName("Tag")))))
+              .bind("fn"))));
+  ASSERT_TRUE(StmtMatch);
+  EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+                       HasValue(std::string(Expected)));
+}
+
 TEST_F(StencilTest, RunOp) {
   StringRef Id = "id";
   auto SimpleFn = [Id](const MatchResult &R) {
diff --git a/clang/utils/TableGen/ASTTableGen.cpp b/clang/utils/TableGen/ASTTableGen.cpp
index 54288ff..4734477 100644
--- a/clang/utils/TableGen/ASTTableGen.cpp
+++ b/clang/utils/TableGen/ASTTableGen.cpp
@@ -31,7 +31,8 @@ llvm::StringRef clang::tblgen::HasProperties::getName() const {
   }
 }
 
-static StringRef removeExpectedNodeNameSuffix(Record *node, StringRef suffix) {
+static StringRef removeExpectedNodeNameSuffix(const Record *node,
+                                              StringRef suffix) {
   StringRef nodeName = node->getName();
   if (!nodeName.ends_with(suffix)) {
     PrintFatalError(node->getLoc(),
@@ -105,8 +106,7 @@ static void visitASTNodeRecursive(ASTNode node, ASTNode base,
   }
 }
 
-static void visitHierarchy(RecordKeeper &records,
-                           StringRef nodeClassName,
+static void visitHierarchy(const RecordKeeper &records, StringRef nodeClassName,
                            ASTNodeHierarchyVisitor<ASTNode> visit) {
   // Check for the node class, just as a basic correctness check.
   if (!records.getClass(nodeClassName)) {
@@ -114,13 +114,10 @@ static void visitHierarchy(RecordKeeper &records,
                     + nodeClassName);
   }
 
-  // Find all the nodes in the hierarchy.
-  auto nodes = records.getAllDerivedDefinitions(nodeClassName);
-
-  // Derive the child map.
+  // Derive the child map for all nodes in the hierarchy.
   ChildMap hierarchy;
   ASTNode root;
-  for (ASTNode node : nodes) {
+  for (ASTNode node : records.getAllDerivedDefinitions(nodeClassName)) {
     if (auto base = node.getBase())
       hierarchy.insert(std::make_pair(base, node));
     else if (root)
@@ -136,8 +133,8 @@ static void visitHierarchy(RecordKeeper &records,
   visitASTNodeRecursive(root, ASTNode(), hierarchy, visit);
 }
 
-void clang::tblgen::visitASTNodeHierarchyImpl(RecordKeeper &records,
-                                              StringRef nodeClassName,
-                                              ASTNodeHierarchyVisitor<ASTNode> visit) {
+void clang::tblgen::visitASTNodeHierarchyImpl(
+    const RecordKeeper &records, StringRef nodeClassName,
+    ASTNodeHierarchyVisitor<ASTNode> visit) {
   visitHierarchy(records, nodeClassName, visit);
 }
diff --git a/clang/utils/TableGen/ASTTableGen.h b/clang/utils/TableGen/ASTTableGen.h
index 41f78a6..143d779 100644
--- a/clang/utils/TableGen/ASTTableGen.h
+++ b/clang/utils/TableGen/ASTTableGen.h
@@ -87,18 +87,18 @@ namespace clang {
 namespace tblgen {
 
 class WrappedRecord {
-  llvm::Record *Record;
+  const llvm::Record *Record;
 
 protected:
-  WrappedRecord(llvm::Record *record = nullptr) : Record(record) {}
+  WrappedRecord(const llvm::Record *record = nullptr) : Record(record) {}
 
-  llvm::Record *get() const {
+  const llvm::Record *get() const {
     assert(Record && "accessing null record");
    return Record;
   }
 
 public:
-  llvm::Record *getRecord() const { return Record; }
+  const llvm::Record *getRecord() const { return Record; }
 
   explicit operator bool() const { return Record != nullptr; }
 
@@ -144,7 +144,7 @@ class HasProperties : public WrappedRecord {
 public:
   static constexpr llvm::StringRef ClassName = HasPropertiesClassName;
 
-  HasProperties(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  HasProperties(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   llvm::StringRef getName() const;
 
@@ -157,7 +157,7 @@ public:
 /// in one of Clang's AST hierarchies.
 class ASTNode : public HasProperties {
 public:
-  ASTNode(llvm::Record *record = nullptr) : HasProperties(record) {}
+  ASTNode(const llvm::Record *record = nullptr) : HasProperties(record) {}
 
   llvm::StringRef getName() const {
     return get()->getName();
@@ -180,7 +180,7 @@ public:
 
 class DeclNode : public ASTNode {
 public:
-  DeclNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+  DeclNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
 
   llvm::StringRef getId() const;
   std::string getClassName() const;
@@ -202,7 +202,7 @@ public:
 
 class TypeNode : public ASTNode {
 public:
-  TypeNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+  TypeNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
 
   llvm::StringRef getId() const;
   llvm::StringRef getClassName() const;
@@ -224,7 +224,7 @@ public:
 
 class StmtNode : public ASTNode {
 public:
-  StmtNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+  StmtNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
 
   std::string getId() const;
   llvm::StringRef getClassName() const;
@@ -247,7 +247,7 @@ public:
 /// The type of a property.
 class PropertyType : public WrappedRecord {
 public:
-  PropertyType(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  PropertyType(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   /// Is this a generic specialization (i.e. `Array<T>` or `Optional<T>`)?
   bool isGenericSpecialization() const {
@@ -331,7 +331,7 @@ public:
 /// A rule for returning the kind of a type.
 class TypeKindRule : public WrappedRecord {
 public:
-  TypeKindRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  TypeKindRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   /// Return the type to which this applies.
   PropertyType getParentType() const {
@@ -361,7 +361,7 @@ public:
 /// An implementation case of a property type.
 class TypeCase : public HasProperties {
 public:
-  TypeCase(llvm::Record *record = nullptr) : HasProperties(record) {}
+  TypeCase(const llvm::Record *record = nullptr) : HasProperties(record) {}
 
   /// Return the name of this case.
   llvm::StringRef getCaseName() const {
@@ -381,7 +381,7 @@ public:
 /// A property of an AST node.
 class Property : public WrappedRecord {
 public:
-  Property(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  Property(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   /// Return the name of this property.
   llvm::StringRef getName() const {
@@ -417,7 +417,8 @@ public:
 /// a value (which is actually done when writing the value out).
 class ReadHelperRule : public WrappedRecord {
 public:
-  ReadHelperRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  ReadHelperRule(const llvm::Record *record = nullptr)
+      : WrappedRecord(record) {}
 
   /// Return the class for which this is a creation rule.
   /// Should never be abstract.
@@ -437,7 +438,7 @@ public:
 /// A rule for how to create an AST node from its properties.
 class CreationRule : public WrappedRecord {
 public:
-  CreationRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  CreationRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   /// Return the class for which this is a creation rule.
   /// Should never be abstract.
@@ -457,7 +458,7 @@ public:
 /// A rule which overrides the standard rules for serializing an AST node.
 class OverrideRule : public WrappedRecord {
 public:
-  OverrideRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+  OverrideRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
 
   /// Return the class for which this is an override rule.
   /// Should never be abstract.
@@ -483,12 +484,12 @@ template <class NodeClass>
 using ASTNodeHierarchyVisitor =
     llvm::function_ref<void(NodeClass node, NodeClass base)>;
 
-void visitASTNodeHierarchyImpl(llvm::RecordKeeper &records,
+void visitASTNodeHierarchyImpl(const llvm::RecordKeeper &records,
                                llvm::StringRef nodeClassName,
                                ASTNodeHierarchyVisitor<ASTNode> visit);
 
 template <class NodeClass>
-void visitASTNodeHierarchy(llvm::RecordKeeper &records,
+void visitASTNodeHierarchy(const llvm::RecordKeeper &records,
                            ASTNodeHierarchyVisitor<NodeClass> visit) {
   visitASTNodeHierarchyImpl(records, NodeClass::getTableGenNodeClassName(),
                             [visit](ASTNode node, ASTNode base) {
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 94f12a0..4ae7600 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -345,7 +345,7 @@ void EmitBuiltin(llvm::raw_ostream &OS, const Record *Builtin) {
   }
 }
 } // namespace
 
-void clang::EmitClangBuiltins(llvm::RecordKeeper &Records,
+void clang::EmitClangBuiltins(const llvm::RecordKeeper &Records,
                               llvm::raw_ostream &OS) {
   emitSourceFileHeader("List of builtins that Clang recognizes", OS);
diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
index aee7d38..1a2503d 100644
--- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
@@ -20,16 +20,16 @@
 
 using namespace llvm;
 
-void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
+void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records,
                                         raw_ostream &OS) {
   emitSourceFileHeader("A list of commands useable in documentation comments",
                        OS, Records);
 
   OS << "namespace {\n"
         "const CommandInfo Commands[] = {\n";
-  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+  ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
   for (size_t i = 0, e = Tags.size(); i != e; ++i) {
-    Record &Tag = *Tags[i];
+    const Record &Tag = *Tags[i];
     OS << "  { "
        << "\"" << Tag.getValueAsString("Name") << "\", "
        << "\"" << Tag.getValueAsString("EndCommandName") << "\", " << i << ", "
@@ -62,7 +62,7 @@ void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
 
   std::vector<StringMatcher::StringPair> Matches;
   for (size_t i = 0, e = Tags.size(); i != e; ++i) {
-    Record &Tag = *Tags[i];
+    const Record &Tag = *Tags[i];
     std::string Name = std::string(Tag.getValueAsString("Name"));
     std::string Return;
     raw_string_ostream(Return) << "return &Commands[" << i << "];";
@@ -112,7 +112,7 @@ static std::string MangleName(StringRef Str) {
   return Mangled;
 }
 
-void clang::EmitClangCommentCommandList(RecordKeeper &Records,
+void clang::EmitClangCommentCommandList(const RecordKeeper &Records,
                                         raw_ostream &OS) {
   emitSourceFileHeader("A list of commands useable in documentation comments",
                        OS, Records);
@@ -121,9 +121,9 @@ void clang::EmitClangCommentCommandList(RecordKeeper &Records,
      << "#  define COMMENT_COMMAND(NAME)\n"
      << "#endif\n";
 
-  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+  ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
   for (size_t i = 0, e = Tags.size(); i != e; ++i) {
-    Record &Tag = *Tags[i];
+    const Record &Tag = *Tags[i];
     std::string MangledName = MangleName(Tag.getValueAsString("Name"));
 
     OS << "COMMENT_COMMAND(" << MangledName << ")\n";
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
index f1cd9af..bd75b3f 100644
--- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
@@ -46,21 +46,17 @@ static bool translateCodePointToUTF8(unsigned CodePoint,
   return true;
 }
 
-void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
-                                                         raw_ostream &OS) {
-  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
+void clang::EmitClangCommentHTMLNamedCharacterReferences(
+    const RecordKeeper &Records, raw_ostream &OS) {
   std::vector<StringMatcher::StringPair> NameToUTF8;
   SmallString<32> CLiteral;
-  for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
-       I != E; ++I) {
-    Record &Tag = **I;
-    std::string Spelling = std::string(Tag.getValueAsString("Spelling"));
-    uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
+  for (const Record *Tag : Records.getAllDerivedDefinitions("NCR")) {
+    std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
+    uint64_t CodePoint = Tag->getValueAsInt("CodePoint");
     CLiteral.clear();
     CLiteral.append("return ");
     if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
-      SrcMgr.PrintMessage(Tag.getLoc().front(),
-                          SourceMgr::DK_Error,
+      SrcMgr.PrintMessage(Tag->getLoc().front(), SourceMgr::DK_Error,
                           Twine("invalid code point"));
       continue;
     }
diff --git a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
index 3dc1098..a457315 100644
--- a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
@@ -19,10 +19,11 @@
 
 using namespace llvm;
 
-void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
-  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+void clang::EmitClangCommentHTMLTags(const RecordKeeper &Records,
+                                     raw_ostream &OS) {
+  ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
   std::vector<StringMatcher::StringPair> Matches;
-  for (Record *Tag : Tags) {
+  for (const Record *Tag : Tags) {
     Matches.emplace_back(std::string(Tag->getValueAsString("Spelling")),
                          "return true;");
   }
@@ -35,12 +36,12 @@ void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
      << "}\n\n";
 }
 
-void clang::EmitClangCommentHTMLTagsProperties(RecordKeeper &Records,
+void clang::EmitClangCommentHTMLTagsProperties(const RecordKeeper &Records,
                                                raw_ostream &OS) {
-  std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+  ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
   std::vector<StringMatcher::StringPair> MatchesEndTagOptional;
   std::vector<StringMatcher::StringPair> MatchesEndTagForbidden;
-  for (Record *Tag : Tags) {
+  for (const Record *Tag : Tags) {
     std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
     StringMatcher::StringPair Match(Spelling, "return true;");
     if (Tag->getValueAsBit("EndTagOptional"))
diff --git a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
index 4508293..dae6710 100644
--- a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
@@ -4,7 +4,7 @@
 
 using namespace llvm;
 
-void clang::EmitClangDataCollectors(RecordKeeper &RK, raw_ostream &OS) {
+void clang::EmitClangDataCollectors(const RecordKeeper &RK, raw_ostream &OS) {
   const auto &Defs = RK.getClasses();
   for (const auto &Entry : Defs) {
     Record &R = *Entry.second;
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 120e1e2..7e426d5 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -20,11 +20,11 @@ using namespace llvm;
 
 namespace {
 class ClangOpcodesEmitter {
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
   unsigned NumTypes;
 
 public:
-  ClangOpcodesEmitter(RecordKeeper &R)
+  ClangOpcodesEmitter(const RecordKeeper &R)
       : Records(R), NumTypes(Records.getAllDerivedDefinitions("Type").size()) {}
 
   void run(raw_ostream &OS);
@@ -404,6 +404,6 @@ void ClangOpcodesEmitter::PrintTypes(raw_ostream &OS,
   OS << ">";
 }
 
-void clang::EmitClangOpcodes(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpcodes(const RecordKeeper &Records, raw_ostream &OS) {
   ClangOpcodesEmitter(Records).run(OS);
 }
diff --git a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
index 74c3a85..d68dcc4 100644
--- a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
@@ -87,7 +87,7 @@ struct BuiltinTableEntries {
 //
 class BuiltinNameEmitter {
 public:
-  BuiltinNameEmitter(RecordKeeper &Records, raw_ostream &OS)
+  BuiltinNameEmitter(const RecordKeeper &Records, raw_ostream &OS)
       : Records(Records), OS(OS) {}
 
   // Entrypoint to generate the functions and structures for checking
@@ -100,7 +100,7 @@ private:
 
   // Contains OpenCL builtin functions and related information, stored as
   // Record instances. They are coming from the associated TableGen file.
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
 
   // The output file.
   raw_ostream &OS;
@@ -113,7 +113,7 @@ private:
   // \param Output (out) String containing the enums to emit in the output file.
   // \param List (out) List containing the extracted Types, except the Types in
   //        TypesSeen.
-  void ExtractEnumTypes(std::vector<Record *> &Types,
+  void ExtractEnumTypes(ArrayRef<const Record *> Types,
                         StringMap<bool> &TypesSeen, std::string &Output,
                         std::vector<const Record *> &List);
 
@@ -237,7 +237,7 @@ private:
 /// Base class for emitting a file (e.g. header or test) from OpenCLBuiltins.td
 class OpenCLBuiltinFileEmitterBase {
 public:
-  OpenCLBuiltinFileEmitterBase(RecordKeeper &Records, raw_ostream &OS)
+  OpenCLBuiltinFileEmitterBase(const RecordKeeper &Records, raw_ostream &OS)
      : Records(Records), OS(OS) {}
   virtual ~OpenCLBuiltinFileEmitterBase() = default;
 
@@ -305,7 +305,7 @@ protected:
   // Contains OpenCL builtin functions and related information, stored as
   // Record instances. They are coming from the associated TableGen file.
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
 
   // The output file.
   raw_ostream &OS;
@@ -316,7 +316,7 @@ protected:
 // builtin function described in the .td input.
 class OpenCLBuiltinTestEmitter : public OpenCLBuiltinFileEmitterBase {
 public:
-  OpenCLBuiltinTestEmitter(RecordKeeper &Records, raw_ostream &OS)
+  OpenCLBuiltinTestEmitter(const RecordKeeper &Records, raw_ostream &OS)
       : OpenCLBuiltinFileEmitterBase(Records, OS) {}
 
   // Entrypoint to generate the functions for testing all OpenCL builtin
@@ -329,7 +329,7 @@ public:
 // prototype for each builtin function described in the .td input.
 class OpenCLBuiltinHeaderEmitter : public OpenCLBuiltinFileEmitterBase {
 public:
-  OpenCLBuiltinHeaderEmitter(RecordKeeper &Records, raw_ostream &OS)
+  OpenCLBuiltinHeaderEmitter(const RecordKeeper &Records, raw_ostream &OS)
       : OpenCLBuiltinFileEmitterBase(Records, OS) {}
 
   // Entrypoint to generate the header.
@@ -362,7 +362,7 @@ void BuiltinNameEmitter::Emit() {
   EmitQualTypeFinder();
 }
 
-void BuiltinNameEmitter::ExtractEnumTypes(std::vector<Record *> &Types,
+void BuiltinNameEmitter::ExtractEnumTypes(ArrayRef<const Record *> Types,
                                           StringMap<bool> &TypesSeen,
                                           std::string &Output,
                                           std::vector<const Record *> &List) {
@@ -392,11 +392,11 @@ void BuiltinNameEmitter::EmitDeclarations() {
   // Extract generic types and non-generic types separately, to keep
   // gentypes at the end of the enum which simplifies the special handling
   // for gentypes in SemaLookup.
-  std::vector<Record *> GenTypes =
+  ArrayRef<const Record *> GenTypes =
       Records.getAllDerivedDefinitions("GenericType");
   ExtractEnumTypes(GenTypes, TypesSeen, GenTypeEnums, GenTypeList);
 
-  std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+  ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
   ExtractEnumTypes(Types, TypesSeen, TypeEnums, TypeList);
 
   OS << TypeEnums;
@@ -499,7 +499,7 @@ static void VerifySignature(const std::vector<Record *> &Signature,
 
 void BuiltinNameEmitter::GetOverloads() {
   // Populate the TypeMap.
-  std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+  ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
   unsigned I = 0;
   for (const auto &T : Types) {
     TypeMap.insert(std::make_pair(T, I++));
@@ -507,7 +507,8 @@ void BuiltinNameEmitter::GetOverloads() {
 
   // Populate the SignaturesList and the FctOverloadMap.
   unsigned CumulativeSignIndex = 0;
-  std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+  ArrayRef<const Record *> Builtins =
+      Records.getAllDerivedDefinitions("Builtin");
   for (const auto *B : Builtins) {
     StringRef BName = B->getValueAsString("Name");
     FctOverloadMap.try_emplace(BName);
@@ -535,7 +536,7 @@ void BuiltinNameEmitter::GetOverloads() {
 void BuiltinNameEmitter::EmitExtensionTable() {
   OS << "static const char *FunctionExtensionTable[] = {\n";
   unsigned Index = 0;
-  std::vector<Record *> FuncExtensions =
+  ArrayRef<const Record *> FuncExtensions =
      Records.getAllDerivedDefinitions("FunctionExtension");
 
   for (const auto &FE : FuncExtensions) {
@@ -804,11 +805,11 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
   OS << "\n  switch (Ty.ID) {\n";
 
   // Switch cases for image types (Image2d, Image3d, ...)
-  std::vector<Record *> ImageTypes =
+  ArrayRef<const Record *> ImageTypes =
       Records.getAllDerivedDefinitions("ImageType");
 
   // Map an image type name to its 3 access-qualified types (RO, WO, RW).
-  StringMap<SmallVector<Record *, 3>> ImageTypesMap;
+  StringMap<SmallVector<const Record *, 3>> ImageTypesMap;
   for (auto *IT : ImageTypes)
     ImageTypesMap[IT->getValueAsString("Name")].push_back(IT);
 
@@ -890,7 +891,7 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
   // Switch cases for non generic, non image types (int, int4, float, ...).
   // Only insert the plain scalar type; vector information and type qualifiers
   // are added in step 2.
-  std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+  ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
   StringMap<bool> TypesSeen;
 
   for (const auto *T : Types) {
@@ -1211,7 +1212,8 @@ void OpenCLBuiltinTestEmitter::emit() {
   unsigned TestID = 0;
 
   // Iterate over all builtins.
-  std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+  ArrayRef<const Record *> Builtins =
+      Records.getAllDerivedDefinitions("Builtin");
   for (const auto *B : Builtins) {
     StringRef Name = B->getValueAsString("Name");
 
@@ -1274,7 +1276,8 @@ void OpenCLBuiltinHeaderEmitter::emit() {
 )";
 
   // Iterate over all builtins; sort to follow order of definition in .td file.
-  std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+  std::vector<const Record *> Builtins =
+      Records.getAllDerivedDefinitions("Builtin");
   llvm::sort(Builtins, LessRecord());
 
   for (const auto *B : Builtins) {
@@ -1319,18 +1322,19 @@ void OpenCLBuiltinHeaderEmitter::emit() {
          "#pragma OPENCL EXTENSION all : disable\n";
 }
 
-void clang::EmitClangOpenCLBuiltins(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpenCLBuiltins(const RecordKeeper &Records,
+                                    raw_ostream &OS) {
   BuiltinNameEmitter NameChecker(Records, OS);
   NameChecker.Emit();
 }
 
-void clang::EmitClangOpenCLBuiltinHeader(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinHeader(const RecordKeeper &Records,
                                          raw_ostream &OS) {
   OpenCLBuiltinHeaderEmitter HeaderFileGenerator(Records, OS);
   HeaderFileGenerator.emit();
 }
 
-void clang::EmitClangOpenCLBuiltinTests(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinTests(const RecordKeeper &Records,
                                         raw_ostream &OS) {
   OpenCLBuiltinTestEmitter TestFileGenerator(Records, OS);
   TestFileGenerator.emit();
diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
index 8683561..8c32f02 100644
--- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp
+++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
 
 namespace {
 struct DocumentedOption {
-  Record *Option;
-  std::vector<Record*> Aliases;
+  const Record *Option;
+  std::vector<const Record *> Aliases;
 };
 struct DocumentedGroup;
 struct Documentation {
@@ -37,7 +37,7 @@ struct Documentation {
   }
 };
 struct DocumentedGroup : Documentation {
-  Record *Group;
+  const Record *Group;
 };
 
 static bool hasFlag(const Record *Option, StringRef OptionFlag,
@@ -63,25 +63,25 @@ static bool isOptionVisible(const Record *Option, const Record *DocInfo) {
 }
 
 // Reorganize the records into a suitable form for emitting documentation.
-Documentation extractDocumentation(RecordKeeper &Records,
+Documentation extractDocumentation(const RecordKeeper &Records,
                                    const Record *DocInfo) {
   Documentation Result;
 
   // Build the tree of groups. The root in the tree is the fake option group
   // (Record*)nullptr, which contains all top-level groups and options.
-  std::map<Record*, std::vector<Record*> > OptionsInGroup;
-  std::map<Record*, std::vector<Record*> > GroupsInGroup;
-  std::map<Record*, std::vector<Record*> > Aliases;
+  std::map<const Record *, std::vector<const Record *>> OptionsInGroup;
+  std::map<const Record *, std::vector<const Record *>> GroupsInGroup;
+  std::map<const Record *, std::vector<const Record *>> Aliases;
 
-  std::map<std::string, Record*> OptionsByName;
-  for (Record *R : Records.getAllDerivedDefinitions("Option"))
+  std::map<std::string, const Record *> OptionsByName;
+  for (const Record *R : Records.getAllDerivedDefinitions("Option"))
     OptionsByName[std::string(R->getValueAsString("Name"))] = R;
 
-  auto Flatten = [](Record *R) {
+  auto Flatten = [](const Record *R) {
     return R->getValue("DocFlatten") && R->getValueAsBit("DocFlatten");
   };
 
-  auto SkipFlattened = [&](Record *R) -> Record* {
+  auto SkipFlattened = [&](const Record *R) -> const Record * {
     while (R && Flatten(R)) {
       auto *G = dyn_cast<DefInit>(R->getValueInit("Group"));
       if (!G)
@@ -91,17 +91,17 @@ Documentation extractDocumentation(RecordKeeper &Records,
     return R;
   };
 
-  for (Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
+  for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
     if (Flatten(R))
       continue;
 
-    Record *Group = nullptr;
+    const Record *Group = nullptr;
     if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
       Group = SkipFlattened(G->getDef());
     GroupsInGroup[Group].push_back(R);
   }
 
-  for (Record *R : Records.getAllDerivedDefinitions("Option")) {
+  for (const Record *R : Records.getAllDerivedDefinitions("Option")) {
    if (auto *A = dyn_cast<DefInit>(R->getValueInit("Alias"))) {
       Aliases[A->getDef()].push_back(R);
       continue;
@@ -120,33 +120,33 @@ Documentation extractDocumentation(RecordKeeper &Records,
       }
     }
 
-    Record *Group = nullptr;
+    const Record *Group = nullptr;
     if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
       Group = SkipFlattened(G->getDef());
     OptionsInGroup[Group].push_back(R);
   }
 
-  auto CompareByName = [](Record *A, Record *B) {
+  auto CompareByName = [](const Record *A, const Record *B) {
     return A->getValueAsString("Name") < B->getValueAsString("Name");
   };
 
-  auto CompareByLocation = [](Record *A, Record *B) {
+  auto CompareByLocation = [](const Record *A, const Record *B) {
     return A->getLoc()[0].getPointer() < B->getLoc()[0].getPointer();
   };
 
-  auto DocumentationForOption = [&](Record *R) -> DocumentedOption {
+  auto DocumentationForOption = [&](const Record *R) -> DocumentedOption {
     auto &A = Aliases[R];
     llvm::sort(A, CompareByName);
     return {R, std::move(A)};
   };
 
-  std::function<Documentation(Record *)> DocumentationForGroup =
-      [&](Record *R) -> Documentation {
+  std::function<Documentation(const Record *)> DocumentationForGroup =
+      [&](const Record *R) -> Documentation {
     Documentation D;
 
     auto &Groups = GroupsInGroup[R];
     llvm::sort(Groups, CompareByLocation);
-    for (Record *G : Groups) {
+    for (const Record *G : Groups) {
       D.Groups.emplace_back();
       D.Groups.back().Group = G;
       Documentation &Base = D.Groups.back();
@@ -157,7 +157,7 @@ Documentation extractDocumentation(RecordKeeper &Records,
 
     auto &Options = OptionsInGroup[R];
     llvm::sort(Options, CompareByName);
-    for (Record *O : Options)
+    for (const Record *O : Options)
       if (isOptionVisible(O, DocInfo))
         D.Options.push_back(DocumentationForOption(O));
 
@@ -444,7 +444,7 @@ void emitDocumentation(int Depth, const Documentation &Doc,
 
 }  // namespace
 
-void clang::EmitClangOptDocs(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOptDocs(const RecordKeeper &Records, raw_ostream &OS) {
   const Record *DocInfo = Records.getDef("GlobalDocumentation");
   if (!DocInfo) {
     PrintFatalError("The GlobalDocumentation top-level definition is missing, "
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 3a424c9..fe55ef2 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -73,7 +73,8 @@ void EmitClangAttrNodeTraverse(llvm::RecordKeeper &Records,
                                llvm::raw_ostream &OS);
 void EmitClangAttrDocTable(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
-void EmitClangBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangBuiltins(const llvm::RecordKeeper &Records,
+                       llvm::raw_ostream &OS);
 
 void EmitClangDiagsDefs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS,
                         const std::string &Component);
@@ -83,18 +84,18 @@ void EmitClangDiagsIndexName(llvm::RecordKeeper &Records,
 
 void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
-void EmitClangCommentHTMLTags(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records,
                               llvm::raw_ostream &OS);
-void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTagsProperties(const llvm::RecordKeeper &Records,
                                         llvm::raw_ostream &OS);
-void EmitClangCommentHTMLNamedCharacterReferences(llvm::RecordKeeper &Records,
-                                                  llvm::raw_ostream &OS);
+void EmitClangCommentHTMLNamedCharacterReferences(
+    const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
-void EmitClangCommentCommandInfo(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records,
                                  llvm::raw_ostream &OS);
-void EmitClangCommentCommandList(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandList(const llvm::RecordKeeper &Records,
                                  llvm::raw_ostream &OS);
 
-void EmitClangOpcodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
 void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records,
                              llvm::raw_ostream &OS);
@@ -142,16 +143,16 @@ void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOptDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
-void EmitClangOpenCLBuiltins(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltins(const llvm::RecordKeeper &Records,
                              llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinHeader(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinHeader(const llvm::RecordKeeper &Records,
                                   llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinTests(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinTests(const llvm::RecordKeeper &Records,
                                  llvm::raw_ostream &OS);
 
-void EmitClangDataCollectors(llvm::RecordKeeper &Records,
+void EmitClangDataCollectors(const llvm::RecordKeeper &Records,
                              llvm::raw_ostream &OS);
 
 void EmitTestPragmaAttributeSupportedAttributes(llvm::RecordKeeper &Records,
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index 0dd3977..ea2da23 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -14,7 +14,7 @@
 
 #include "aarch64.h"
 
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64)
 #error This file is intended only for aarch64-based targets
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index bf84a2f..666e6f3 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -151,10 +151,6 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
     // This may happen inside the DTOR of main thread, so just ignore it.
     tls_size = 0;
   }
-  if (tls_size) {
-    CHECK_LE(tls_beg, reinterpret_cast<uptr>(res) - kDtvOffset);
-    CHECK_LT(reinterpret_cast<uptr>(res) - kDtvOffset, tls_beg + tls_size);
-  }
   dtv->beg = tls_beg;
   dtv->size = tls_size;
   return dtv;
diff --git a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
index 857bec9..fd1ce0c 100644
--- a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
+++ b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
@@ -11,8 +11,8 @@
 // RUN: %clangxx_asan -x c -DSO_NAME=f1 %s -shared -o %t-f1.so -fPIC
 // RUN: %clangxx_asan -x c -DSO_NAME=f2 %s -shared -o %t-f2.so -fPIC
 // RUN: %clangxx_asan %s -ldl -pthread -o %t
-// RUN: %env_asan_opts=verbosity=2 %run not %t 0 3
-// RUN: %env_asan_opts=verbosity=2 %run %t 2 3
+// RUN: %run %t 0 3
+// RUN: %run %t 2 3
 // RUN: %env_asan_opts=verbosity=2 %run %t 10 2 2>&1 | FileCheck %s
 // RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=1 %run %t 10 2 2>&1 | FileCheck %s
 // RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=0 %run %t 10 2 2>&1 | FileCheck %s --check-prefix=CHECK0
@@ -29,7 +29,6 @@
 // CHECK-NOT: num_live_dtls 5
 //
 // CHECK0-NOT: __tls_get_addr
-
 /*
 cc=your-compiler
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 10aef72..e94a49f 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -2744,7 +2744,6 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
             (!procedure->IsElemental() && nonElemental)) {
           int d{ComputeCudaMatchingDistance(
               context_.languageFeatures(), *procedure, localActuals)};
-          llvm::errs() << "matching distance: " << d << "\n";
           if (d != crtMatchingDistance) {
             if (d > crtMatchingDistance) {
               continue;
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 45dfe3e..8643c9b 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -75,6 +75,10 @@ function(_get_compile_options_from_config output_var)
     list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT")
   endif()
 
+  if(LIBC_ADD_NULL_CHECKS)
+    list(APPEND config_options "-DLIBC_ADD_NULL_CHECKS")
+  endif()
+
   set(${output_var} ${config_options} PARENT_SCOPE)
 endfunction(_get_compile_options_from_config)
diff --git a/libc/config/config.json b/libc/config/config.json
index 2e72c0a..7dfbb56 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -94,5 +94,11 @@
       "value": false,
       "doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit."
     }
+  },
+  "general": {
+    "LIBC_ADD_NULL_CHECKS": {
+      "value": true,
+      "doc": "Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior."
+ } } } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 54ca5d5..86875d4 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -30,6 +30,8 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. * **"errno" options** - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. +* **"general" options** + - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST. * **"printf" options** diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt index a2fad9b..e0b65b7 100644 --- a/libc/hdr/CMakeLists.txt +++ b/libc/hdr/CMakeLists.txt @@ -143,4 +143,22 @@ add_proxy_header_library( libc.include.llvm-libc-macros.limits_macros ) +add_proxy_header_library( + link_macros + HDRS + link_macros.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-macros.link_macros + libc.include.link +) + +add_proxy_header_library( + sys_auxv_macros + HDRS + sys_auxv_macros.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-macros.sys_auxv_macros + libc.include.sys_auxv +) + add_subdirectory(types) diff --git a/libc/hdr/link_macros.h b/libc/hdr/link_macros.h new file mode 100644 index 0000000..8a78a86 --- /dev/null +++ b/libc/hdr/link_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from link.h ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_LINK_MACROS_H +#define LLVM_LIBC_HDR_LINK_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/link-macros.h" + +#else // Overlay mode + +#include <link.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_LINK_MACROS_H diff --git a/libc/hdr/sys_auxv_macros.h b/libc/hdr/sys_auxv_macros.h new file mode 100644 index 0000000..c04011b --- /dev/null +++ b/libc/hdr/sys_auxv_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from sys/auxv.h ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_SYS_AUXV_MACROS_H +#define LLVM_LIBC_HDR_SYS_AUXV_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/sys-auxv-macros.h" + +#else // Overlay mode + +#include <sys/auxv.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_SYS_AUXV_MACROS_H diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 9bd1e29..0302ad6 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -192,6 +192,9 @@ add_header_library( libc.src.__support.CPP.optional libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.config + libc.src.__support.macros.null_check + libc.src.__support.macros.optimization libc.src.errno.errno ) diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt index 089cad4..6c70149 100644 --- a/libc/src/__support/OSUtil/linux/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt @@ -23,3 +23,33 @@ add_object_library( libc.hdr.types.struct_f_owner_ex libc.hdr.types.off_t ) + +add_header_library( + vdso_sym + HDRS + vdso_sym.h + DEPENDS + libc.src.__support.common +) + +add_object_library( + vdso + HDRS + vdso.h + SRCS + vdso.cpp + DEPENDS + .${LIBC_TARGET_ARCHITECTURE}.vdso + libc.src.__support.CPP.array + libc.src.__support.CPP.optional + libc.src.__support.CPP.string_view + libc.src.__support.threads.callonce + libc.src.__support.threads.linux.futex_word_type + libc.hdr.types.struct_timeval + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.hdr.types.time_t + libc.hdr.link_macros + libc.src.errno.errno + libc.src.sys.auxv.getauxval +) diff --git a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt index eea9bad..d9451a1 100644 --- a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/aarch64/vdso.h b/libc/src/__support/OSUtil/linux/aarch64/vdso.h new file mode 100644 index 0000000..3c4c620 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/aarch64/vdso.h @@ -0,0 +1,37 @@ +//===---------- aarch64 vdso configuration ------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::RTSigReturn: + return "__kernel_rt_sigreturn"; + case VDSOSym::GetTimeOfDay: + return "__kernel_gettimeofday"; + case VDSOSym::ClockGetTime: + return "__kernel_clock_gettime"; + case VDSOSym::ClockGetRes: + return "__kernel_clock_getres"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6.39"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt index 733366f..d991f7e 100644 --- a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/arm/vdso.h b/libc/src/__support/OSUtil/linux/arm/vdso.h new file mode 100644 index 0000000..3de5860 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/arm/vdso.h @@ -0,0 +1,37 @@ +//===---------- arm vdso configuration ----------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + case VDSOSym::ClockGetTime64: + return "__vdso_clock_gettime64"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt index e271204..eb93dd4 100644 --- a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/riscv/vdso.h b/libc/src/__support/OSUtil/linux/riscv/vdso.h new file mode 100644 index 0000000..24ddb25 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/riscv/vdso.h @@ -0,0 +1,43 @@ +//===---------- RISC-V vdso configuration -------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/riscv/kernel/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::RTSigReturn: + return "__vdso_rt_sigreturn"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + case VDSOSym::GetCpu: + return "__vdso_getcpu"; + case VDSOSym::FlushICache: + return "__vdso_flush_icache"; + case VDSOSym::RiscvHwProbe: + return "__vdso_riscv_hwprobe"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_4.15"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp new file mode 100644 index 0000000..cb43764 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso.cpp @@ -0,0 +1,237 @@ +//===------------- Linux VDSO Implementation --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "src/__support/OSUtil/linux/vdso.h" +#include "hdr/link_macros.h" +#include "hdr/sys_auxv_macros.h" +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/threads/callonce.h" +#include "src/__support/threads/linux/futex_word.h" +#include "src/errno/libc_errno.h" +#include "src/sys/auxv/getauxval.h" +#include <linux/auxvec.h> + +// TODO: This is a temporary workaround to avoid including elf.h +// Include our own headers for ElfW and friends once we have them. 
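A note on ElfW, the macro the TODO above wants to stop relying on: it is the token-pasting helper that glibc's <link.h> provides so the same source can name Elf32_* or Elf64_* types depending on the target. A minimal sketch of the idea, assuming a 64-bit Linux target and the standard type names from <elf.h>; the SKETCH_* names are invented for illustration and are not part of this patch:

#include <elf.h> // Elf64_Word, Elf64_Addr, ...

// Paste in two steps so the class macro expands before concatenation.
#define SKETCH_ELF_CLASS 64 // glibc derives this from the target ABI
#define SKETCH_ElfW_PASTE(e, c, t) e##c##_##t
#define SKETCH_ElfW_EVAL(e, c, t) SKETCH_ElfW_PASTE(e, c, t)
#define SKETCH_ElfW(type) SKETCH_ElfW_EVAL(Elf, SKETCH_ELF_CLASS, type)

// ELF64 still uses a 32-bit Word; only Addr/Off/Xword widen to 64 bits.
static_assert(sizeof(SKETCH_ElfW(Word)) == 4, "Elf64_Word is 32 bits");
static_assert(sizeof(SKETCH_ElfW(Addr)) == 8, "Elf64_Addr is 64 bits");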
+namespace LIBC_NAMESPACE_DECL { + +namespace vdso { + +Symbol::VDSOArray Symbol::global_cache{}; +CallOnceFlag Symbol::once_flag = callonce_impl::NOT_CALLED; + +namespace { +// See https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symverdefs.html +struct Verdaux { + ElfW(Word) vda_name; /* Version or dependency names */ + ElfW(Word) vda_next; /* Offset in bytes to next verdaux + entry */ +}; +struct Verdef { + ElfW(Half) vd_version; /* Version revision */ + ElfW(Half) vd_flags; /* Version information */ + ElfW(Half) vd_ndx; /* Version Index */ + ElfW(Half) vd_cnt; /* Number of associated aux entries */ + ElfW(Word) vd_hash; /* Version name hash value */ + ElfW(Word) vd_aux; /* Offset in bytes to verdaux array */ + ElfW(Word) vd_next; /* Offset in bytes to next verdef entry */ + Verdef *next() const { + if (vd_next == 0) + return nullptr; + return reinterpret_cast<Verdef *>(reinterpret_cast<uintptr_t>(this) + + vd_next); + } + Verdaux *aux() const { + return reinterpret_cast<Verdaux *>(reinterpret_cast<uintptr_t>(this) + + vd_aux); + } +}; + +// version search procedure specified by +// https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symversion.html#SYMVERTBL +cpp::string_view find_version(Verdef *verdef, ElfW(Half) * versym, + const char *strtab, size_t idx) { + constexpr ElfW(Half) VER_FLG_BASE = 0x1; + if (!versym) + return ""; + ElfW(Half) identifier = versym[idx] & 0x7FFF; + // iterate through all version definitions + for (Verdef *def = verdef; def != nullptr; def = def->next()) { + // skip if this is a file-level version + if (def->vd_flags & VER_FLG_BASE) + continue; + // check if the version identifier matches. Highest bit is used to determine + // whether the symbol is local. Only lower 15 bits are used for version + // identifier. 
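To make the 15-bit masking in the check just below concrete: a symbol's .gnu.version entry and a Verdef's vd_ndx both carry a version index in their low 15 bits, while bit 15 of the versym entry only marks the symbol as hidden. A toy example with made-up values, not the patch's data structures:

#include <cstdint>

int main() {
  uint16_t versym_entry = 0x8002; // hidden bit (0x8000) set, version index 2
  uint16_t vd_ndx = 2;            // index recorded in the matching Verdef
  bool hidden = (versym_entry & 0x8000) != 0;
  bool matches = (versym_entry & 0x7FFF) == (vd_ndx & 0x7FFF);
  return (hidden && matches) ? 0 : 1; // both hold for this entry
}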
+ if ((def->vd_ndx & 0x7FFF) == identifier) { + Verdaux *aux = def->aux(); + return strtab + aux->vda_name; + } + } + return ""; +} + +size_t shdr_get_symbol_count(ElfW(Shdr) * vdso_shdr, size_t e_shnum) { + if (!vdso_shdr) + return 0; + // iterate all sections until we locate the dynamic symbol section + for (size_t i = 0; i < e_shnum; ++i) { + // dynamic symbol section is a table section + // therefore, the number of entries can be computed as the ratio + // of the section size to the size of a single entry + if (vdso_shdr[i].sh_type == SHT_DYNSYM) + return vdso_shdr[i].sh_size / vdso_shdr[i].sh_entsize; + } + return 0; +} + +struct VDSOSymbolTable { + const char *strtab; + ElfW(Sym) * symtab; + // The following can be nullptr if the vDSO does not have versioning + ElfW(Half) * versym; + Verdef *verdef; + + void populate_symbol_cache(Symbol::VDSOArray &symbol_table, + size_t symbol_count, ElfW(Addr) vdso_addr) { + for (size_t i = 0, e = symbol_table.size(); i < e; ++i) { + Symbol sym = i; + cpp::string_view name = sym.name(); + cpp::string_view version = sym.version(); + if (name.empty()) + continue; + + for (size_t j = 0; j < symbol_count; ++j) { + if (name == strtab + symtab[j].st_name) { + // we found a symbol with the desired name + // now we need to check if it has the right version + if (versym && verdef && + version != find_version(verdef, versym, strtab, j)) + continue; + + // put the symbol address into the symbol table + symbol_table[i] = + reinterpret_cast<void *>(vdso_addr + symtab[j].st_value); + } + } + } + } +}; + +struct PhdrInfo { + ElfW(Addr) vdso_addr; + ElfW(Dyn) * vdso_dyn; + static cpp::optional<PhdrInfo> from(ElfW(Phdr) * vdso_phdr, size_t e_phnum, + uintptr_t vdso_ehdr_addr) { + constexpr ElfW(Addr) INVALID_ADDR = static_cast<ElfW(Addr)>(-1); + ElfW(Addr) vdso_addr = INVALID_ADDR; + ElfW(Dyn) *vdso_dyn = nullptr; + if (!vdso_phdr) + return cpp::nullopt; + // iterate through all the program headers until we get the desired pieces + for (size_t i = 0; i < e_phnum; ++i) { + if (vdso_phdr[i].p_type == PT_DYNAMIC) + vdso_dyn = reinterpret_cast<ElfW(Dyn) *>(vdso_ehdr_addr + + vdso_phdr[i].p_offset); + + if (vdso_phdr[i].p_type == PT_LOAD) + vdso_addr = + vdso_ehdr_addr + vdso_phdr[i].p_offset - vdso_phdr[i].p_vaddr; + + if (vdso_addr && vdso_dyn) + return PhdrInfo{vdso_addr, vdso_dyn}; + } + + return cpp::nullopt; + } + + cpp::optional<VDSOSymbolTable> populate_symbol_table() { + const char *strtab = nullptr; + ElfW(Sym) *symtab = nullptr; + ElfW(Half) *versym = nullptr; + Verdef *verdef = nullptr; + for (ElfW(Dyn) *d = vdso_dyn; d->d_tag != DT_NULL; ++d) { + switch (d->d_tag) { + case DT_STRTAB: + strtab = reinterpret_cast<const char *>(vdso_addr + d->d_un.d_ptr); + break; + case DT_SYMTAB: + symtab = reinterpret_cast<ElfW(Sym) *>(vdso_addr + d->d_un.d_ptr); + break; + case DT_VERSYM: + versym = reinterpret_cast<uint16_t *>(vdso_addr + d->d_un.d_ptr); + break; + case DT_VERDEF: + verdef = reinterpret_cast<Verdef *>(vdso_addr + d->d_un.d_ptr); + break; + } + if (strtab && symtab && versym && verdef) + break; + } + if (strtab == nullptr || symtab == nullptr) + return cpp::nullopt; + + return VDSOSymbolTable{strtab, symtab, versym, verdef}; + } +}; +} // namespace + +void Symbol::initialize_vdso_global_cache() { + // first clear the symbol table + for (auto &i : global_cache) + i = nullptr; + + // get the address of the VDSO, protect errno since getauxval may change + // it + int errno_backup = libc_errno; + uintptr_t vdso_ehdr_addr = getauxval(AT_SYSINFO_EHDR); + // Get the
memory address of the vDSO ELF header. + auto vdso_ehdr = reinterpret_cast<ElfW(Ehdr) *>(vdso_ehdr_addr); + // leave the table unpopulated if we don't have vDSO + if (vdso_ehdr == nullptr) { + libc_errno = errno_backup; + return; + } + + // locate the section header inside the elf using the section header + // offset + auto vdso_shdr = + reinterpret_cast<ElfW(Shdr) *>(vdso_ehdr_addr + vdso_ehdr->e_shoff); + size_t symbol_count = shdr_get_symbol_count(vdso_shdr, vdso_ehdr->e_shnum); + + // early return if no symbol is found + if (symbol_count == 0) + return; + + // We need to find both the loadable segment and the dynamic linking of + // the vDSO. Compute vdso_phdr as the program header using the program + // header offset + ElfW(Phdr) *vdso_phdr = + reinterpret_cast<ElfW(Phdr) *>(vdso_ehdr_addr + vdso_ehdr->e_phoff); + cpp::optional<PhdrInfo> phdr_info = + PhdrInfo::from(vdso_phdr, vdso_ehdr->e_phnum, vdso_ehdr_addr); + // early return if either the dynamic linking or the loadable segment is + // not found + if (!phdr_info.has_value()) + return; + + // now, locate several more tables inside the dynamic linking section + cpp::optional<VDSOSymbolTable> vdso_symbol_table = + phdr_info->populate_symbol_table(); + + // early return if we can't find any required fields of the symbol table + if (!vdso_symbol_table.has_value()) + return; + + // finally, populate the global symbol table cache + vdso_symbol_table->populate_symbol_cache(global_cache, symbol_count, + phdr_info->vdso_addr); +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/linux/vdso.h b/libc/src/__support/OSUtil/linux/vdso.h new file mode 100644 index 0000000..a5108b3 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso.h @@ -0,0 +1,81 @@ +//===------------- Linux VDSO Header ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H +#include "src/__support/CPP/array.h" +#include "src/__support/common.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/properties/architectures.h" +#include "src/__support/threads/callonce.h" + +#if defined(LIBC_TARGET_ARCH_IS_X86) +#include "x86_64/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) +#include "aarch64/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +#include "arm/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) +#include "riscv/vdso.h" +#else +#error "unknown arch" +#endif + +namespace LIBC_NAMESPACE_DECL { +namespace vdso { + +class Symbol { + VDSOSym sym; + +public: + LIBC_INLINE_VAR static constexpr size_t COUNT = + static_cast<size_t>(VDSOSym::VDSOSymCount); + LIBC_INLINE constexpr explicit Symbol(VDSOSym sym) : sym(sym) {} + LIBC_INLINE constexpr Symbol(size_t idx) : sym(static_cast<VDSOSym>(idx)) {} + LIBC_INLINE constexpr cpp::string_view name() const { + return symbol_name(sym); + } + LIBC_INLINE constexpr cpp::string_view version() const { + return symbol_version(sym); + } + LIBC_INLINE constexpr operator size_t() const { + return static_cast<size_t>(sym); + } + LIBC_INLINE constexpr bool is_valid() const { + return *this < Symbol::global_cache.size(); + } + using VDSOArray = cpp::array<void *, Symbol::COUNT>; + +private: + static CallOnceFlag once_flag; + static VDSOArray global_cache; + static void initialize_vdso_global_cache(); + + LIBC_INLINE void *get() const { + if (name().empty() || !is_valid()) + return nullptr; + + callonce(&once_flag, Symbol::initialize_vdso_global_cache); + return (global_cache[*this]); + } + template <VDSOSym sym> friend struct TypedSymbol; +}; + +template <VDSOSym sym> struct TypedSymbol { + LIBC_INLINE constexpr operator VDSOSymType<sym>() const { + return cpp::bit_cast<VDSOSymType<sym>>(Symbol{sym}.get()); + } + template <typename... Args> + LIBC_INLINE auto operator()(Args &&...args) const { + return this->operator VDSOSymType<sym>()(cpp::forward<Args>(args)...); + } +}; + +} // namespace vdso + +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h new file mode 100644 index 0000000..eb5f204 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso_sym.h @@ -0,0 +1,70 @@ +//===------------- Linux VDSO Symbols ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "hdr/types/clock_t.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "hdr/types/struct_timeval.h" +#include "hdr/types/time_t.h" +#include "src/__support/common.h" +#include <stddef.h> // For size_t. + +// NOLINTBEGIN(llvmlibc-implementation-in-namespace) +// TODO: some of the following can be defined via proxy headers. 
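The proxy headers this TODO refers to would follow the same pattern as hdr/link_macros.h and hdr/sys_auxv_macros.h added earlier in this patch: include the llvm-libc header in full build mode and fall back to the system header in overlay mode. A hypothetical proxy for the timezone forward declaration below might look like this; the path and guard names are invented, not part of the patch:

// hdr/types/struct_timezone.h -- hypothetical, for illustration only
#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_TIMEZONE_H
#define LLVM_LIBC_HDR_TYPES_STRUCT_TIMEZONE_H

#ifdef LIBC_FULL_BUILD
#include "include/llvm-libc-types/struct_timezone.h"
#else // Overlay mode
#include <sys/time.h> // system definition of struct timezone
#endif // LIBC_FULL_BUILD

#endif // LLVM_LIBC_HDR_TYPES_STRUCT_TIMEZONE_H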
+struct __kernel_timespec; +struct timezone; +struct riscv_hwprobe; +struct getcpu_cache; +struct cpu_set_t; +// NOLINTEND(llvmlibc-implementation-in-namespace) + +namespace LIBC_NAMESPACE_DECL { +namespace vdso { + +enum class VDSOSym { + ClockGetTime, + ClockGetTime64, + GetTimeOfDay, + GetCpu, + Time, + ClockGetRes, + RTSigReturn, + FlushICache, + RiscvHwProbe, + VDSOSymCount +}; + +template <VDSOSym sym> LIBC_INLINE constexpr auto dispatcher() { + if constexpr (sym == VDSOSym::ClockGetTime) + return static_cast<int (*)(clockid_t, timespec *)>(nullptr); + else if constexpr (sym == VDSOSym::ClockGetTime64) + return static_cast<int (*)(clockid_t, __kernel_timespec *)>(nullptr); + else if constexpr (sym == VDSOSym::GetTimeOfDay) + return static_cast<int (*)(timeval *__restrict, timezone *__restrict)>( + nullptr); + else if constexpr (sym == VDSOSym::GetCpu) + return static_cast<int (*)(unsigned *, unsigned *, getcpu_cache *)>( + nullptr); + else if constexpr (sym == VDSOSym::Time) + return static_cast<time_t (*)(time_t *)>(nullptr); + else if constexpr (sym == VDSOSym::ClockGetRes) + return static_cast<int (*)(clockid_t, timespec *)>(nullptr); + else if constexpr (sym == VDSOSym::RTSigReturn) + return static_cast<void (*)(void)>(nullptr); + else if constexpr (sym == VDSOSym::FlushICache) + return static_cast<void (*)(void *, void *, unsigned int)>(nullptr); + else if constexpr (sym == VDSOSym::RiscvHwProbe) + return static_cast<int (*)(riscv_hwprobe *, size_t, size_t, cpu_set_t *, + unsigned)>(nullptr); + else + return static_cast<void *>(nullptr); +} + +template <VDSOSym sym> using VDSOSymType = decltype(dispatcher<sym>()); + +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt index a7f2d74e6..1324491 100644 --- a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/x86_64/vdso.h b/libc/src/__support/OSUtil/linux/x86_64/vdso.h new file mode 100644 index 0000000..abe7c33 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/x86_64/vdso.h @@ -0,0 +1,43 @@ +//===---------- x86/x86_64 vdso configuration ---------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// On x86, there are symbols defined without the __vdso_ prefix; however, +// it is suggested that one use the __vdso_ prefix. +// Additionally, there is also an __vdso_sgx_enter_enclave for SGX +// support; we do not include it here for now.
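An aside on the dispatcher<sym>() helper defined in vdso_sym.h above: it is never called at runtime; the null function pointer it returns exists only so decltype can map each enumerator to the matching function-pointer type. A self-contained sketch of the same device with a toy enum and toy signatures, not the patch's definitions (the x86 symbol-name table resumes below, with the kernel linker script as its reference):

#include <type_traits>

enum class Sym { Time, GetCpu };

// The returned null pointer is only a carrier for its own type.
template <Sym s> constexpr auto dispatcher() {
  if constexpr (s == Sym::Time)
    return static_cast<long (*)(long *)>(nullptr);
  else
    return static_cast<int (*)(unsigned *, unsigned *)>(nullptr);
}

template <Sym s> using SymType = decltype(dispatcher<s>());

static_assert(std::is_same_v<SymType<Sym::Time>, long (*)(long *)>);
static_assert(
    std::is_same_v<SymType<Sym::GetCpu>, int (*)(unsigned *, unsigned *)>);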
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/entry/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::GetCpu: + return "__vdso_getcpu"; + case VDSOSym::Time: + return "__vdso_time"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H diff --git a/libc/src/__support/macros/CMakeLists.txt b/libc/src/__support/macros/CMakeLists.txt index bcd47c3..99d4f64 100644 --- a/libc/src/__support/macros/CMakeLists.txt +++ b/libc/src/__support/macros/CMakeLists.txt @@ -27,3 +27,13 @@ add_header_library( DEPENDS libc.src.__support.macros.properties.compiler ) + +add_header_library( + null_check + HDRS + null_check.h + DEPENDS + .config + .optimization + .sanitizer +) diff --git a/libc/src/__support/macros/null_check.h b/libc/src/__support/macros/null_check.h new file mode 100644 index 0000000..400f7d8 --- /dev/null +++ b/libc/src/__support/macros/null_check.h @@ -0,0 +1,33 @@ +//===-- Safe nullptr check --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H +#define LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/sanitizer.h" + +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) +// Use volatile to prevent undefined behavior of dereferencing nullptr. +// Intentionally crashing with SIGSEGV. +#define LIBC_CRASH_ON_NULLPTR(PTR) \ + do { \ + if (LIBC_UNLIKELY(PTR == nullptr)) { \ + volatile auto *crashing = PTR; \ + [[maybe_unused]] volatile auto crash = *crashing; \ + __builtin_trap(); \ + } \ + } while (0) +#else +#define LIBC_CRASH_ON_NULLPTR(ptr) \ + do { \ + } while (0) +#endif + +#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h index c4f8b5b..c20412e 100644 --- a/libc/src/__support/macros/sanitizer.h +++ b/libc/src/__support/macros/sanitizer.h @@ -15,7 +15,25 @@ // Functions to unpoison memory //----------------------------------------------------------------------------- +#if LIBC_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define LIBC_HAS_ADDRESS_SANITIZER +#endif + #if LIBC_HAS_FEATURE(memory_sanitizer) +#define LIBC_HAS_MEMORY_SANITIZER +#endif + +#if LIBC_HAS_FEATURE(undefined_behavior_sanitizer) +#define LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER +#endif + +#if defined(LIBC_HAS_ADDRESS_SANITIZER) || \ + defined(LIBC_HAS_MEMORY_SANITIZER) || \ + defined(LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER) +#define LIBC_HAS_SANITIZER +#endif + +#ifdef LIBC_HAS_MEMORY_SANITIZER // Only perform MSAN unpoison in non-constexpr context. 
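Stepping back to null_check.h above before the sanitizer.h hunk resumes: the macro turns a nullptr argument into a deterministic crash, a volatile dereference that raises SIGSEGV followed by __builtin_trap, and it expands to nothing when a sanitizer is active or LIBC_ADD_NULL_CHECKS is off. A hypothetical caller, in the style of the strtonan change later in this patch; the function itself is illustrative, not libc code:

#include "src/__support/macros/null_check.h" // in-tree include path

// Crash loudly on nullptr instead of leaving the dereference as UB.
static int first_byte(const char *p) {
  LIBC_CRASH_ON_NULLPTR(p);
  return p[0];
}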
#include <sanitizer/msan_interface.h> #define MSAN_UNPOISON(addr, size) \ @@ -27,8 +45,7 @@ #define MSAN_UNPOISON(ptr, size) #endif -#if LIBC_HAS_FEATURE(address_sanitizer) -#define LIBC_HAVE_ADDRESS_SANITIZER +#ifdef LIBC_HAS_ADDRESS_SANITIZER #include <sanitizer/asan_interface.h> #define ASAN_POISON_MEMORY_REGION(addr, size) \ __asan_poison_memory_region((addr), (size)) diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index ffd6ebf..a452b3a 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -20,6 +20,8 @@ #include "src/__support/detailed_powers_of_ten.h" #include "src/__support/high_precision_decimal.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "src/__support/macros/optimization.h" #include "src/__support/str_to_integer.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" @@ -1208,6 +1210,8 @@ template <class T> LIBC_INLINE StrToNumResult<T> strtonan(const char *arg) { using FPBits = typename fputil::FPBits<T>; using StorageType = typename FPBits::StorageType; + LIBC_CRASH_ON_NULLPTR(arg); + FPBits result; int error = 0; StorageType nan_mantissa = 0; diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp index c8a15dd..799aad1 100644 --- a/libc/src/stdlib/atexit.cpp +++ b/libc/src/stdlib/atexit.cpp @@ -16,6 +16,7 @@ namespace LIBC_NAMESPACE_DECL { constinit ExitCallbackList atexit_callbacks; Mutex handler_list_mtx(false, false, false, false); +[[gnu::weak]] extern void teardown_main_tls(); extern "C" { @@ -24,8 +25,11 @@ int __cxa_atexit(AtExitCallback *callback, void *payload, void *) { } void __cxa_finalize(void *dso) { - if (!dso) + if (!dso) { call_exit_callbacks(atexit_callbacks); + if (teardown_main_tls) + teardown_main_tls(); + } } } // extern "C" diff --git a/libc/src/stdlib/quick_exit.cpp b/libc/src/stdlib/quick_exit.cpp index a5abf3e..29110b3 100644 --- a/libc/src/stdlib/quick_exit.cpp +++ b/libc/src/stdlib/quick_exit.cpp @@ -16,9 +16,12 @@ namespace LIBC_NAMESPACE_DECL { extern ExitCallbackList at_quick_exit_callbacks; +[[gnu::weak]] extern void teardown_main_tls(); [[noreturn]] LLVM_LIBC_FUNCTION(void, quick_exit, (int status)) { call_exit_callbacks(at_quick_exit_callbacks); + if (teardown_main_tls) + teardown_main_tls(); internal::exit(status); } diff --git a/libc/src/sys/auxv/getauxval.h b/libc/src/sys/auxv/getauxval.h index 3e69713..d9da45ff 100644 --- a/libc/src/sys/auxv/getauxval.h +++ b/libc/src/sys/auxv/getauxval.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H #define LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H +#include "hdr/sys_auxv_macros.h" #include "src/__support/macros/config.h" -#include <sys/auxv.h> namespace LIBC_NAMESPACE_DECL { diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index f09541b..8bd0c3a 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -17,6 +17,9 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +// FIXME: Factor this out into common logic so we don't need to stub it here. 
+void teardown_main_tls() {} + DataEnvironment app; extern "C" uintptr_t __init_array_start[]; diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index ef1e63e..bc529b3 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -19,6 +19,9 @@ namespace LIBC_NAMESPACE_DECL { DataEnvironment app; +// FIXME: Factor this out into common logic so we don't need to stub it here. +void teardown_main_tls() {} + extern "C" { // Nvidia's 'nvlink' linker does not provide these symbols. We instead need // to manually create them and update the globals in the loader implementation. diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp index 72060b4..ff104c7 100644 --- a/libc/startup/linux/do_start.cpp +++ b/libc/startup/linux/do_start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// #include "startup/linux/do_start.h" +#include "config/linux/app.h" #include "include/llvm-libc-macros/link-macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/macros/config.h" @@ -60,6 +61,10 @@ static void call_fini_array_callbacks() { } static ThreadAttributes main_thread_attrib; +static TLSDescriptor tls; +// We separate teardown_main_tls from callbacks as callback functions themselves +// may require TLS. +void teardown_main_tls() { cleanup_tls(tls.addr, tls.size); } [[noreturn]] void do_start() { auto tid = syscall_impl<long>(SYS_gettid); @@ -122,7 +127,6 @@ static ThreadAttributes main_thread_attrib; // This descriptor has to be static since its cleanup function cannot // capture the context. - static TLSDescriptor tls; init_tls(tls); if (tls.size != 0 && !set_thread_ptr(tls.tp)) syscall_impl<long>(SYS_exit, 1); @@ -130,10 +134,7 @@ static ThreadAttributes main_thread_attrib; self.attrib = &main_thread_attrib; main_thread_attrib.atexit_callback_mgr = internal::get_thread_atexit_callback_mgr(); - // We register the cleanup_tls function to be the last atexit callback to be - // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such - // as the stack protector canary). - atexit([]() { cleanup_tls(tls.addr, tls.size); }); + // We want the fini array callbacks to be run after other atexit // callbacks are run.
So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt index bfb072c..ff82616 100644 --- a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt +++ b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt @@ -1,3 +1,21 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) endif() + +add_libc_test( + vdso_test + SUITE libc-osutil-tests + SRCS vdso_test.cpp + DEPENDS + libc.src.__support.OSUtil.linux.vdso + libc.src.__support.OSUtil.osutil + libc.hdr.types.struct_sigaction + libc.hdr.types.struct_timeval + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.hdr.types.time_t + libc.hdr.time_macros + libc.hdr.signal_macros + libc.src.signal.sigaction + libc.src.signal.raise +) diff --git a/libc/test/src/__support/OSUtil/linux/vdso_test.cpp b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp new file mode 100644 index 0000000..2363db6 --- /dev/null +++ b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp @@ -0,0 +1,162 @@ +//===-- Unittests for VDSO ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/signal_macros.h" +#include "hdr/time_macros.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_sigaction.h" +#include "hdr/types/struct_timespec.h" +#include "hdr/types/struct_timeval.h" +#include "hdr/types/time_t.h" +#include "src/__support/OSUtil/linux/vdso.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/macros/properties/architectures.h" +#include "src/signal/raise.h" +#include "src/signal/sigaction.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" +#include <linux/time_types.h> +#include <sys/syscall.h> + +struct riscv_hwprobe { + int64_t key; + uint64_t value; +}; + +namespace LIBC_NAMESPACE_DECL { +// For x86_64, we explicitly test that some traditional vdso symbols are +// indeed available. + +TEST(LlvmLibcOSUtilVDSOTest, GetTimeOfDay) { + vdso::TypedSymbol<vdso::VDSOSym::GetTimeOfDay> symbol; + if (!symbol) + return; + timeval tv; + EXPECT_EQ(symbol(&tv, nullptr), 0); + // hopefully people are not building time machines using our libc.
+ EXPECT_GT(tv.tv_sec, static_cast<decltype(tv.tv_sec)>(0)); +} + +TEST(LlvmLibcOSUtilVDSOTest, Time) { + vdso::TypedSymbol<vdso::VDSOSym::Time> symbol; + if (!symbol) + return; + time_t a, b; + EXPECT_GT(symbol(&a), static_cast<time_t>(0)); + EXPECT_GT(symbol(&b), static_cast<time_t>(0)); + EXPECT_GE(b, a); +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime) { + vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime> symbol; + if (!symbol) + return; + timespec a, b; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0); + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0); + if (a.tv_sec == b.tv_sec) { + EXPECT_LT(a.tv_nsec, b.tv_nsec); + } else { + EXPECT_LT(a.tv_sec, b.tv_sec); + } +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime64) { + vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime64> symbol; + if (!symbol) + return; + // See kernel API at + // https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/vDSO/vdso_test_correctness.c#L155 + __kernel_timespec a, b; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0); + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0); + if (a.tv_sec == b.tv_sec) { + EXPECT_LT(a.tv_nsec, b.tv_nsec); + } else { + EXPECT_LT(a.tv_sec, b.tv_sec); + } +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetRes) { + vdso::TypedSymbol<vdso::VDSOSym::ClockGetRes> symbol; + if (!symbol) + return; + timespec res{}; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &res), 0); + EXPECT_TRUE(res.tv_sec > 0 || res.tv_nsec > 0); +} + +TEST(LlvmLibcOSUtilVDSOTest, GetCpu) { + // The kernel system call has a third argument, which should be passed as + // nullptr. + vdso::TypedSymbol<vdso::VDSOSym::GetCpu> symbol; + if (!symbol) + return; + unsigned cpu = static_cast<unsigned>(-1), node = static_cast<unsigned>(-1); + EXPECT_EQ(symbol(&cpu, &node, nullptr), 0); + EXPECT_GE(cpu, 0u); + EXPECT_GE(node, 0u); +} + +static bool flag = false; +static void sigprof_handler [[gnu::used]] (int) { flag = true; } + +TEST(LlvmLibcOSUtilVDSOTest, RtSigReturn) { + using namespace testing::ErrnoSetterMatcher; + // must use struct since there is a function of the same name in the same + // scope. + struct sigaction sa {}; + struct sigaction old_sa {}; + sa.sa_handler = sigprof_handler; + sa.sa_flags = SA_RESTORER; + vdso::TypedSymbol<vdso::VDSOSym::RTSigReturn> symbol; + if (!symbol) + return; + sa.sa_restorer = symbol; + ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &sa, &old_sa), Succeeds()); + raise(SIGPROF); + ASSERT_TRUE(flag); + flag = false; + ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &old_sa, nullptr), Succeeds()); +} + +TEST(LlvmLibcOSUtilVDSOTest, FlushICache) { + vdso::TypedSymbol<vdso::VDSOSym::FlushICache> symbol; + if (!symbol) + return; + char buf[512]; + // we just check that the flush will not crash the program. + // the flags argument only takes 0/1 as of kernel 6.10, and is used to + // indicate whether the flush is local to the core or global. + symbol(buf, buf + sizeof(buf), 0); + symbol(buf, buf + sizeof(buf), 1); +} + +// https://docs.kernel.org/6.5/riscv/hwprobe.html +TEST(LlvmLibcOSUtilVDSOTest, RiscvHwProbe) { + using namespace testing::ErrnoSetterMatcher; + vdso::TypedSymbol<vdso::VDSOSym::RiscvHwProbe> symbol; + if (!symbol) + return; + // If a key is unknown to the kernel, its key field will be cleared to -1, and + // its value set to 0. We expect all probes.value fields to be 0.
+ // Usermode can supply NULL for cpus and 0 for cpu_count as a shortcut for all + // online CPUs + riscv_hwprobe probes[2] = {{-1, 1}, {-1, 1}}; + ASSERT_THAT(symbol(/*pairs=*/probes, /*count=*/2, /*cpusetsize=*/0, + /*cpuset=*/nullptr, + /*flags=*/0), + Succeeds()); + for (auto &probe : probes) { + EXPECT_EQ(probe.key, static_cast<decltype(probe.key)>(-1)); + EXPECT_EQ(probe.value, static_cast<decltype(probe.value)>(0)); + } +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/compiler/CMakeLists.txt b/libc/test/src/compiler/CMakeLists.txt index 65a9acc..a45fa8c 100644 --- a/libc/test/src/compiler/CMakeLists.txt +++ b/libc/test/src/compiler/CMakeLists.txt @@ -7,6 +7,7 @@ add_libc_unittest( SRCS stack_chk_guard_test.cpp DEPENDS + libc.hdr.signal_macros libc.src.__support.macros.sanitizer libc.src.compiler.__stack_chk_fail libc.src.string.memset diff --git a/libc/test/src/compiler/stack_chk_guard_test.cpp b/libc/test/src/compiler/stack_chk_guard_test.cpp index 6b71e15..4ec8398 100644 --- a/libc/test/src/compiler/stack_chk_guard_test.cpp +++ b/libc/test/src/compiler/stack_chk_guard_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-macros/signal-macros.h" +#include "hdr/signal_macros.h" #include "src/__support/macros/sanitizer.h" #include "src/compiler/__stack_chk_fail.h" #include "src/string/memset.h" @@ -18,7 +18,7 @@ TEST(LlvmLibcStackChkFail, Death) { // Disable the test when asan is enabled so that it doesn't immediately fail // after the memset, but before the stack canary is re-checked. -#ifndef LIBC_HAVE_ADDRESS_SANITIZER +#ifndef LIBC_HAS_ADDRESS_SANITIZER TEST(LlvmLibcStackChkFail, Smash) { EXPECT_DEATH( [] { @@ -27,4 +27,4 @@ TEST(LlvmLibcStackChkFail, Smash) { }, WITH_SIGNAL(SIGABRT)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 7271e93..e943d98 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -2895,9 +2895,10 @@ add_fp_unittest( SRCS nanf_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY @@ -2910,9 +2911,10 @@ add_fp_unittest( SRCS nan_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nan libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY @@ -2925,9 +2927,10 @@ add_fp_unittest( SRCS nanl_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanl libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. 
UNIT_TEST_ONLY @@ -2940,7 +2943,7 @@ add_fp_unittest( SRCS nanf16_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf16 libc.src.__support.FPUtil.fp_bits libc.src.__support.macros.sanitizer @@ -2956,9 +2959,10 @@ add_fp_unittest( SRCS nanf128_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf128 libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp index 68c8441..46b9e9a 100644 --- a/libc/test/src/math/smoke/nan_test.cpp +++ b/libc/test/src/math/smoke/nan_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nan.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include <signal.h> class LlvmLibcNanTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: @@ -43,8 +44,8 @@ TEST_F(LlvmLibcNanTest, RandomString) { run_test("123 ", 0x7ff8000000000000); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp index 015cc31..25dd2ef 100644 --- a/libc/test/src/math/smoke/nanf128_test.cpp +++ b/libc/test/src/math/smoke/nanf128_test.cpp @@ -6,7 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/__support/uint128.h" #include "src/math/nanf128.h" #include "test/UnitTest/FEnvSafeTest.h" @@ -53,9 +55,8 @@ TEST_F(LlvmLibcNanf128Test, RandomString) { QUIET_NAN); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) -#include <signal.h> +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanf128Test, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp index 81b844b..ec640a3 100644 --- a/libc/test/src/math/smoke/nanf16_test.cpp +++ b/libc/test/src/math/smoke/nanf16_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/sanitizer.h" #include "src/math/nanf16.h" @@ -13,8 +14,6 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include <signal.h> - class LlvmLibcNanf16Test : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: using StorageType = LIBC_NAMESPACE::fputil::FPBits<float16>::StorageType; @@ -44,8 +43,8 @@ TEST_F(LlvmLibcNanf16Test, RandomString) { run_test("123 ", 0x7e00); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && 
defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanf16Test, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf_test.cpp b/libc/test/src/math/smoke/nanf_test.cpp index ff58236..dd3124e 100644 --- a/libc/test/src/math/smoke/nanf_test.cpp +++ b/libc/test/src/math/smoke/nanf_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nanf.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include <signal.h> class LlvmLibcNanfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: @@ -42,8 +43,8 @@ TEST_F(LlvmLibcNanfTest, RandomString) { run_test("123 ", 0x7fc00000); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanfTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp index de9af05..ef3f9c1 100644 --- a/libc/test/src/math/smoke/nanl_test.cpp +++ b/libc/test/src/math/smoke/nanl_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nanl.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include <signal.h> #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64) #define SELECT_LONG_DOUBLE(val, _, __) val @@ -70,8 +71,8 @@ TEST_F(LlvmLibcNanlTest, RandomString) { run_test("123 ", expected); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanlTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a571832..ffff811 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -925,7 +925,6 @@ set(files exception execution expected - experimental/__config experimental/__simd/aligned_tag.h experimental/__simd/declaration.h experimental/__simd/reference.h diff --git a/libcxx/include/__config b/libcxx/include/__config index b0a5dda..f0a9243 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -591,6 +591,15 @@ typedef __char32_t char32_t; inline namespace _LIBCPP_ABI_NAMESPACE { # define _LIBCPP_END_NAMESPACE_STD }} _LIBCPP_POP_EXTENSION_DIAGNOSTICS +#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL namespace std { namespace experimental { +#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL }} + +#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 { +#define _LIBCPP_END_NAMESPACE_LFTS } _LIBCPP_END_NAMESPACE_EXPERIMENTAL + +#define 
_LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 { +#define _LIBCPP_END_NAMESPACE_LFTS_V2 } _LIBCPP_END_NAMESPACE_EXPERIMENTAL + #ifdef _LIBCPP_ABI_NO_FILESYSTEM_INLINE_NAMESPACE # define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_STD namespace filesystem { # define _LIBCPP_END_NAMESPACE_FILESYSTEM } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h index 86d9f28..5980b07 100644 --- a/libcxx/include/__pstl/backend.h +++ b/libcxx/include/__pstl/backend.h @@ -19,16 +19,20 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> -#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) -# include <__pstl/backends/default.h> -# include <__pstl/backends/serial.h> -#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) -# include <__pstl/backends/default.h> -# include <__pstl/backends/std_thread.h> -#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) -# include <__pstl/backends/default.h> -# include <__pstl/backends/libdispatch.h> -#endif +#if _LIBCPP_STD_VER >= 17 + +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# include <__pstl/backends/default.h> +# include <__pstl/backends/serial.h> +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# include <__pstl/backends/default.h> +# include <__pstl/backends/std_thread.h> +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# include <__pstl/backends/default.h> +# include <__pstl/backends/libdispatch.h> +# endif + +#endif // _LIBCPP_STD_VER >= 17 _LIBCPP_POP_MACROS diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h index 32c5da5..2132e8d 100644 --- a/libcxx/include/__pstl/backend_fwd.h +++ b/libcxx/include/__pstl/backend_fwd.h @@ -39,6 +39,8 @@ _LIBCPP_PUSH_MACROS // the user. // +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -50,18 +52,18 @@ struct __libdispatch_backend_tag; struct __serial_backend_tag; struct __std_thread_backend_tag; -#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>; -#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>; -#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>; -#else +# else // ...New vendors can add parallel backends here... 
-# error "Invalid PSTL backend configuration" -#endif +# error "Invalid PSTL backend configuration" +# endif template <class _Backend, class _ExecutionPolicy> struct __find_if; @@ -296,6 +298,8 @@ struct __reduce; } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKEND_FWD_H diff --git a/libcxx/include/__pstl/backends/default.h b/libcxx/include/__pstl/backends/default.h index b655da5..3672bbf 100644 --- a/libcxx/include/__pstl/backends/default.h +++ b/libcxx/include/__pstl/backends/default.h @@ -33,6 +33,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -498,6 +500,8 @@ struct __rotate_copy<__default_backend_tag, _ExecutionPolicy> { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_DEFAULT_H diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h index a0c3ad9..2d6ab49 100644 --- a/libcxx/include/__pstl/backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -44,6 +44,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -140,15 +142,15 @@ struct __cpu_traits<__libdispatch_backend_tag> { unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( [&]() -> __merge_range_t* { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif +# endif return std::allocator<__merge_range_t>().allocate(__n_ranges); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (const std::bad_alloc&) { return nullptr; } -#endif +# endif }(), __destroy); @@ -392,6 +394,8 @@ struct __fill<__libdispatch_backend_tag, _ExecutionPolicy> } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H diff --git a/libcxx/include/__pstl/backends/serial.h b/libcxx/include/__pstl/backends/serial.h index 5f24499..f414201 100644 --- a/libcxx/include/__pstl/backends/serial.h +++ b/libcxx/include/__pstl/backends/serial.h @@ -30,6 +30,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -176,6 +178,8 @@ struct __transform_reduce_binary<__serial_backend_tag, _ExecutionPolicy> { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H diff --git a/libcxx/include/__pstl/backends/std_thread.h b/libcxx/include/__pstl/backends/std_thread.h index 49570bd..19b985f 100644 --- a/libcxx/include/__pstl/backends/std_thread.h +++ b/libcxx/include/__pstl/backends/std_thread.h @@ -32,6 +32,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -131,6 +133,8 @@ struct __fill<__std_thread_backend_tag, _ExecutionPolicy> } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H diff --git a/libcxx/include/__pstl/cpu_algos/any_of.h b/libcxx/include/__pstl/cpu_algos/any_of.h index b33c787..803db79 100644 --- a/libcxx/include/__pstl/cpu_algos/any_of.h +++ b/libcxx/include/__pstl/cpu_algos/any_of.h @@ -26,6 +26,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + 
_LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -94,6 +96,8 @@ struct __cpu_parallel_any_of { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 0483d691..5e59752 100644 --- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -19,6 +19,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -81,6 +83,8 @@ struct __cpu_traits; } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H diff --git a/libcxx/include/__pstl/cpu_algos/fill.h b/libcxx/include/__pstl/cpu_algos/fill.h index 4e6d29b..3e59365 100644 --- a/libcxx/include/__pstl/cpu_algos/fill.h +++ b/libcxx/include/__pstl/cpu_algos/fill.h @@ -23,6 +23,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -63,4 +65,6 @@ struct __cpu_parallel_fill { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_FILL_H diff --git a/libcxx/include/__pstl/cpu_algos/find_if.h b/libcxx/include/__pstl/cpu_algos/find_if.h index 12b2e88..cd92e5a 100644 --- a/libcxx/include/__pstl/cpu_algos/find_if.h +++ b/libcxx/include/__pstl/cpu_algos/find_if.h @@ -31,6 +31,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -132,6 +134,8 @@ struct __cpu_parallel_find_if { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H diff --git a/libcxx/include/__pstl/cpu_algos/for_each.h b/libcxx/include/__pstl/cpu_algos/for_each.h index d4d7862..cec719b 100644 --- a/libcxx/include/__pstl/cpu_algos/for_each.h +++ b/libcxx/include/__pstl/cpu_algos/for_each.h @@ -23,6 +23,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -63,4 +65,6 @@ struct __cpu_parallel_for_each { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H diff --git a/libcxx/include/__pstl/cpu_algos/merge.h b/libcxx/include/__pstl/cpu_algos/merge.h index dfa4cbf..a9069ca 100644 --- a/libcxx/include/__pstl/cpu_algos/merge.h +++ b/libcxx/include/__pstl/cpu_algos/merge.h @@ -26,6 +26,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -80,6 +82,8 @@ struct __cpu_parallel_merge { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_MERGE_H diff --git a/libcxx/include/__pstl/cpu_algos/stable_sort.h b/libcxx/include/__pstl/cpu_algos/stable_sort.h index 8e64f3e..5afdd3f 100644 --- a/libcxx/include/__pstl/cpu_algos/stable_sort.h +++ b/libcxx/include/__pstl/cpu_algos/stable_sort.h @@ -21,6 +21,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -44,4 +46,6 @@ struct __cpu_parallel_stable_sort { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H diff --git 
a/libcxx/include/__pstl/cpu_algos/transform.h b/libcxx/include/__pstl/cpu_algos/transform.h index 27ce8e2..979121b 100644 --- a/libcxx/include/__pstl/cpu_algos/transform.h +++ b/libcxx/include/__pstl/cpu_algos/transform.h @@ -27,6 +27,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -148,6 +150,8 @@ struct __cpu_parallel_transform_binary { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H diff --git a/libcxx/include/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__pstl/cpu_algos/transform_reduce.h index 36ac1a9..aafbf1c 100644 --- a/libcxx/include/__pstl/cpu_algos/transform_reduce.h +++ b/libcxx/include/__pstl/cpu_algos/transform_reduce.h @@ -31,6 +31,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -211,6 +213,8 @@ struct __cpu_parallel_transform_reduce { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h index 5e903f7..ea40fa7 100644 --- a/libcxx/include/__pstl/dispatch.h +++ b/libcxx/include/__pstl/dispatch.h @@ -23,6 +23,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -61,6 +63,8 @@ using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfigu } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_DISPATCH_H diff --git a/libcxx/include/__pstl/handle_exception.h b/libcxx/include/__pstl/handle_exception.h index d627095..57dfcfd 100644 --- a/libcxx/include/__pstl/handle_exception.h +++ b/libcxx/include/__pstl/handle_exception.h @@ -22,6 +22,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -52,6 +54,8 @@ _LIBCPP_HIDE_FROM_ABI auto __handle_exception(_Args&&... __args) { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_HANDLE_EXCEPTION_H diff --git a/libcxx/include/experimental/__config b/libcxx/include/experimental/__config deleted file mode 100644 index 7b23791..0000000 --- a/libcxx/include/experimental/__config +++ /dev/null @@ -1,45 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_EXPERIMENTAL_CONFIG -#define _LIBCPP_EXPERIMENTAL_CONFIG - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL \ - namespace std { \ - namespace experimental { -#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL \ - } \ - } - -#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 { -#define _LIBCPP_END_NAMESPACE_LFTS \ - } \ - } \ - } - -#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 { -#define _LIBCPP_END_NAMESPACE_LFTS_V2 \ - } \ - } \ - } - -// TODO: support more targets -#if defined(__AVX__) -# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32 -#else -# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16 -#endif - -#endif diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h index 31d2b50..e364e14 100644 --- a/libcxx/include/experimental/__simd/aligned_tag.h +++ b/libcxx/include/experimental/__simd/aligned_tag.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H +#include <__config> #include <__memory/assume_aligned.h> #include <__type_traits/remove_const.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/traits.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h index 7b45d03..2ac7224 100644 --- a/libcxx/include/experimental/__simd/declaration.h +++ b/libcxx/include/experimental/__simd/declaration.h @@ -10,11 +10,18 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H #define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H +#include <__config> #include <cstddef> -#include <experimental/__config> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) +// TODO: support more targets +# if defined(__AVX__) +# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32 +# else +# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16 +# endif + _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace parallelism_v2 { namespace simd_abi { diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index c60c08b..cba460b 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> @@ -17,7 +18,6 @@ #include <__utility/forward.h> #include <__utility/move.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/utility.h> _LIBCPP_PUSH_MACROS diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index a2aeeb5..a76933e 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -11,9 +11,9 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H #include <__assert> +#include <__config> #include <__type_traits/integral_constant.h> #include <cstddef> -#include <experimental/__config> #include 
<experimental/__simd/declaration.h> #include <experimental/__simd/traits.h> diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index db0f9b3..2c65d19 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/declaration.h> #include <experimental/__simd/reference.h> #include <experimental/__simd/traits.h> diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index d54d489..5527319 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/declaration.h> #include <experimental/__simd/reference.h> #include <experimental/__simd/traits.h> diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h index ec25b4b..b817df60 100644 --- a/libcxx/include/experimental/__simd/traits.h +++ b/libcxx/include/experimental/__simd/traits.h @@ -11,10 +11,10 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H #include <__bit/bit_ceil.h> +#include <__config> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/declaration.h> #include <experimental/__simd/utility.h> diff --git a/libcxx/include/experimental/__simd/utility.h b/libcxx/include/experimental/__simd/utility.h index 708fa3d..0103b06 100644 --- a/libcxx/include/experimental/__simd/utility.h +++ b/libcxx/include/experimental/__simd/utility.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H #define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H +#include <__config> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_const.h> #include <__type_traits/is_constant_evaluated.h> @@ -22,7 +23,6 @@ #include <__utility/integer_sequence.h> #include <cstddef> #include <cstdint> -#include <experimental/__config> #include <limits> _LIBCPP_PUSH_MACROS diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 5787f23..1f707cf 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -12,11 +12,11 @@ #include <__assert> #include <__bit/bit_ceil.h> +#include <__config> #include <__type_traits/integral_constant.h> #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include <cstddef> -#include <experimental/__config> #include <experimental/__simd/declaration.h> #include <experimental/__simd/traits.h> #include <experimental/__simd/utility.h> diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator index de82da2..edfe6e7 100644 --- a/libcxx/include/experimental/iterator +++ b/libcxx/include/experimental/iterator @@ -52,11 +52,11 @@ namespace std { */ +#include <__config> #include <__memory/addressof.h> #include 
<__type_traits/decay.h> #include <__utility/forward.h> #include <__utility/move.h> -#include <experimental/__config> #include <iterator> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory index e9663d4..bf8a154 100644 --- a/libcxx/include/experimental/memory +++ b/libcxx/include/experimental/memory @@ -49,6 +49,7 @@ public: } */ +#include <__config> #include <__functional/hash.h> #include <__functional/operations.h> #include <__type_traits/add_lvalue_reference.h> @@ -57,7 +58,6 @@ public: #include <__type_traits/enable_if.h> #include <__type_traits/is_convertible.h> #include <cstddef> -#include <experimental/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const index d7a695d..510d374 100644 --- a/libcxx/include/experimental/propagate_const +++ b/libcxx/include/experimental/propagate_const @@ -107,6 +107,7 @@ */ +#include <__config> #include <__functional/operations.h> #include <__fwd/functional.h> #include <__type_traits/conditional.h> @@ -128,7 +129,6 @@ #include <__utility/move.h> #include <__utility/swap.h> #include <cstddef> -#include <experimental/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd index 484543b..35120b4 100644 --- a/libcxx/include/experimental/simd +++ b/libcxx/include/experimental/simd @@ -75,7 +75,7 @@ inline namespace parallelism_v2 { # pragma GCC system_header #endif -#include <experimental/__config> +#include <__config> #include <experimental/__simd/aligned_tag.h> #include <experimental/__simd/declaration.h> #include <experimental/__simd/reference.h> diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits index 31b041b..a4bb59a 100644 --- a/libcxx/include/experimental/type_traits +++ b/libcxx/include/experimental/type_traits @@ -68,7 +68,7 @@ inline namespace fundamentals_v1 { */ -#include <experimental/__config> +#include <__config> #if _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility index 8bd0a05..cbc7ad1 100644 --- a/libcxx/include/experimental/utility +++ b/libcxx/include/experimental/utility @@ -30,7 +30,7 @@ inline namespace fundamentals_v1 { */ -#include <experimental/__config> +#include <__config> #include <utility> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 65df579..add8726d 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -610,10 +610,6 @@ module std_experimental [system] { header "experimental/utility" export * } - module __config { - textual header "experimental/__config" - export * - } } // Convenience method to get all of the above modules in a single import statement. 
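Editorial note on the libcxx changes above: <experimental/__config> is deleted and the experimental namespace macros now come from <__config> itself. A minimal sketch (not from the patch) of how the relocated macros expand inside a libc++ internal header; __example_trait is a hypothetical name used only for illustration:

#include <__config> // now defines the _LIBCPP_*_NAMESPACE_LFTS macros directly

_LIBCPP_BEGIN_NAMESPACE_LFTS
// expands to: namespace std { namespace experimental { inline namespace fundamentals_v1 {
template <class _Tp>
struct __example_trait {}; // hypothetical declaration, illustration only
_LIBCPP_END_NAMESPACE_LFTS
// expands to the three matching closing braces

// Because fundamentals_v1 is an inline namespace, the name is reachable
// as std::experimental::__example_trait.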
diff --git a/libcxx/src/any.cpp b/libcxx/src/any.cpp index b0ad695..eaca2dd 100644 --- a/libcxx/src/any.cpp +++ b/libcxx/src/any.cpp @@ -12,7 +12,7 @@ namespace std { const char* bad_any_cast::what() const noexcept { return "bad any cast"; } } // namespace std -#include <experimental/__config> +#include <__config> // Preserve std::experimental::any_bad_cast for ABI compatibility // Even though it no longer exists in a header file diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp index 62b474a..4e7e288 100644 --- a/libcxx/src/optional.cpp +++ b/libcxx/src/optional.cpp @@ -17,7 +17,7 @@ const char* bad_optional_access::what() const noexcept { return "bad_optional_ac } // namespace std -#include <experimental/__config> +#include <__config> // Preserve std::experimental::bad_optional_access for ABI compatibility // Even though it no longer exists in a header file diff --git a/lld/test/wasm/static-error.s b/lld/test/wasm/static-error.s new file mode 100644 index 0000000..3557506 --- /dev/null +++ b/lld/test/wasm/static-error.s @@ -0,0 +1,12 @@ +// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o +// RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o + +// RUN: wasm-ld --experimental-pic -pie -o /dev/null %t.o %t.so +// RUN: not wasm-ld -o /dev/null -static %t.o %t.so 2>&1 | FileCheck %s + +// CHECK: attempted static link of dynamic object + +.global _start +_start: + .functype _start () -> () + end_function diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index cb8fe25..2de7dca 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -333,9 +333,15 @@ void LinkerDriver::addFile(StringRef path) { return; } case file_magic::bitcode: - case file_magic::wasm_object: - files.push_back(createObjectFile(mbref, "", 0, inLib)); + case file_magic::wasm_object: { + auto obj = createObjectFile(mbref, "", 0, inLib); + if (config->isStatic && isa<SharedFile>(obj)) { + error("attempted static link of dynamic object " + path); + break; + } + files.push_back(obj); break; + } case file_magic::unknown: if (mbref.getBuffer().starts_with("#STUB")) { files.push_back(make<StubFile>(mbref)); diff --git a/lldb/include/lldb/API/SBMemoryRegionInfo.h b/lldb/include/lldb/API/SBMemoryRegionInfo.h index be55de4..f9a5dc9 100644 --- a/lldb/include/lldb/API/SBMemoryRegionInfo.h +++ b/lldb/include/lldb/API/SBMemoryRegionInfo.h @@ -120,7 +120,7 @@ public: private: friend class SBProcess; friend class SBMemoryRegionInfoList; - + friend class SBSaveCoreOptions; friend class lldb_private::ScriptInterpreter; lldb_private::MemoryRegionInfo &ref(); diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h index ba48ba5..c076d3c 100644 --- a/lldb/include/lldb/API/SBSaveCoreOptions.h +++ b/lldb/include/lldb/API/SBSaveCoreOptions.h @@ -80,6 +80,17 @@ public: /// \return True if the thread was removed, false if it was not in the list. bool RemoveThread(lldb::SBThread thread); + /// Add a memory region to save in the core file. + /// + /// \param region The memory region to save. + /// \returns An empty SBError upon success, or an error if the region is + /// invalid. + /// \note Ranges that overlap will be unioned into a single region; this + /// also supersedes stack minification. Specifying full regions and a + /// non-custom core style will include the specified regions and union them + /// with all style-specific regions. + SBError AddMemoryRegionToSave(const SBMemoryRegionInfo &region); + /// Reset all options.
void Clear(); diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 172824d..e386271 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -74,6 +74,10 @@ public: const Checksum &GetChecksum() const { return m_checksum; } + llvm::once_flag &GetChecksumWarningOnceFlag() { + return m_checksum_warning_once_flag; + } + protected: /// Set file and update modification time. void SetSupportFile(lldb::SupportFileSP support_file_sp); @@ -87,6 +91,9 @@ public: /// Keep track of the on-disk checksum. Checksum m_checksum; + /// Once flag for emitting a checksum mismatch warning. + llvm::once_flag m_checksum_warning_once_flag; + // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h index f4fed46..d90d080 100644 --- a/lldb/include/lldb/Symbol/SaveCoreOptions.h +++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h @@ -10,13 +10,15 @@ #define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H #include "lldb/Utility/FileSpec.h" -#include "lldb/lldb-forward.h" -#include "lldb/lldb-types.h" +#include "lldb/Utility/RangeMap.h" #include <optional> +#include <set> #include <string> #include <unordered_set> +using MemoryRanges = lldb_private::RangeVector<lldb::addr_t, lldb::addr_t>; + namespace lldb_private { class SaveCoreOptions { @@ -38,8 +40,12 @@ public: Status AddThread(lldb::ThreadSP thread_sp); bool RemoveThread(lldb::ThreadSP thread_sp); bool ShouldThreadBeSaved(lldb::tid_t tid) const; + bool HasSpecifiedThreads() const; Status EnsureValidConfiguration(lldb::ProcessSP process_sp) const; + const MemoryRanges &GetCoreFileMemoryRanges() const; + + void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo &region); void Clear(); @@ -51,6 +57,7 @@ private: std::optional<lldb::SaveCoreStyle> m_style; lldb::ProcessSP m_process_sp; std::unordered_set<lldb::tid_t> m_threads_to_save; + MemoryRanges m_regions_to_save; }; } // namespace lldb_private diff --git a/lldb/include/lldb/Target/CoreFileMemoryRanges.h b/lldb/include/lldb/Target/CoreFileMemoryRanges.h new file mode 100644 index 0000000..503ecd6 --- /dev/null +++ b/lldb/include/lldb/Target/CoreFileMemoryRanges.h @@ -0,0 +1,50 @@ +//===-- CoreFileMemoryRanges.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Utility/RangeMap.h" +#include "lldb/Utility/Status.h" + +#include "llvm/ADT/AddressRanges.h" + +#ifndef LLDB_TARGET_COREFILEMEMORYRANGES_H +#define LLDB_TARGET_COREFILEMEMORYRANGES_H + +namespace lldb_private { + +struct CoreFileMemoryRange { + llvm::AddressRange range; /// The address range to save into the core file. + uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits.
+ + bool operator==(const CoreFileMemoryRange &rhs) const { + return range == rhs.range && lldb_permissions == rhs.lldb_permissions; + } + + bool operator!=(const CoreFileMemoryRange &rhs) const { + return !(*this == rhs); + } + + bool operator<(const CoreFileMemoryRange &rhs) const { + if (range < rhs.range) + return true; + if (range == rhs.range) + return lldb_permissions < rhs.lldb_permissions; + return false; + } +}; + +class CoreFileMemoryRanges + : public lldb_private::RangeDataVector<lldb::addr_t, lldb::addr_t, + CoreFileMemoryRange> { +public: + /// Finalize and merge all overlapping ranges in this collection. Ranges + /// will be separated based on permissions. + Status FinalizeCoreFileSaveRanges(); +}; +} // namespace lldb_private + +#endif // LLDB_TARGET_COREFILEMEMORYRANGES_H diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index c66cfb2..b8c53a4 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -35,6 +35,8 @@ #include "lldb/Host/ProcessLaunchInfo.h" #include "lldb/Host/ProcessRunLock.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" +#include "lldb/Target/CoreFileMemoryRanges.h" #include "lldb/Target/ExecutionContextScope.h" #include "lldb/Target/InstrumentationRuntime.h" #include "lldb/Target/Memory.h" @@ -710,29 +712,6 @@ public: /// is not supported by the plugin, error otherwise. virtual llvm::Expected<bool> SaveCore(llvm::StringRef outfile); - struct CoreFileMemoryRange { - llvm::AddressRange range; /// The address range to save into the core file. - uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits. - - bool operator==(const CoreFileMemoryRange &rhs) const { - return range == rhs.range && lldb_permissions == rhs.lldb_permissions; - } - - bool operator!=(const CoreFileMemoryRange &rhs) const { - return !(*this == rhs); - } - - bool operator<(const CoreFileMemoryRange &rhs) const { - if (range < rhs.range) - return true; - if (range == rhs.range) - return lldb_permissions < rhs.lldb_permissions; - return false; - } - }; - - using CoreFileMemoryRanges = std::vector<CoreFileMemoryRange>; - /// Helper function for Process::SaveCore(...) that calculates the address /// ranges that should be saved. This allows all core file plug-ins to save /// consistent memory ranges given a \a core_style. diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h index 8cc382b..433466e 100644 --- a/lldb/include/lldb/Utility/RangeMap.h +++ b/lldb/include/lldb/Utility/RangeMap.h @@ -450,6 +450,12 @@ public: void Append(const Entry &entry) { m_entries.emplace_back(entry); } + /// Append a range with data to the vector + /// \param b The base of the memory range + /// \param s The size of the memory range + /// \param t The data associated with the memory range + void Append(B &&b, S &&s, T &&t) { m_entries.emplace_back(Entry(b, s, t)); } + bool Erase(uint32_t start, uint32_t end) { if (start >= end || end > m_entries.size()) return false; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7bfde8b..938f6e3 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1222,6 +1222,7 @@ enum SaveCoreStyle { eSaveCoreFull = 1, eSaveCoreDirtyOnly = 2, eSaveCoreStackOnly = 3, + eSaveCoreCustomOnly = 4, }; /// Events that might happen during a trace session.
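Editorial note: the lldb pieces introduced so far (SBSaveCoreOptions::AddMemoryRegionToSave, the SaveCoreOptions plumbing, and the new eSaveCoreCustomOnly style) combine roughly as in the following sketch. This is a minimal illustration, not code from the patch; SaveFirstRegionOnly is a hypothetical helper, and the setters it uses (SetOutputFile, SetPluginName, SetStyle, SaveCore) are the same ones exercised by the API test later in this commit.

#include "lldb/API/LLDB.h"

// Hypothetical helper: save only the first memory region of a live process
// into a minidump via the new custom-only core style.
static lldb::SBError SaveFirstRegionOnly(lldb::SBProcess &process,
                                         const char *path) {
  lldb::SBMemoryRegionInfo region;
  lldb::SBMemoryRegionInfoList regions = process.GetMemoryRegions();
  regions.GetMemoryRegionAtIndex(0, region);

  lldb::SBSaveCoreOptions options;
  options.SetOutputFile(lldb::SBFileSpec(path));
  options.SetPluginName("minidump");
  options.SetStyle(lldb::eSaveCoreCustomOnly); // new style from this patch
  options.AddMemoryRegionToSave(region);       // new API from this patch
  // Overlapping requested ranges are unioned later by
  // CoreFileMemoryRanges::FinalizeCoreFileSaveRanges().
  return process.SaveCore(options);
}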
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 337eff6..5fb288a 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -207,6 +207,7 @@ class StackFrameRecognizer; class StackFrameRecognizerManager; class StackID; class Status; +class SaveCoreOptions; class StopInfo; class Stoppoint; class StoppointCallbackContext; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index b3c8cda..5bac5cd 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -9,7 +9,6 @@ #ifndef LLDB_LLDB_PRIVATE_INTERFACES_H #define LLDB_LLDB_PRIVATE_INTERFACES_H -#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/lldb-enumerations.h" #include "lldb/lldb-forward.h" #include "lldb/lldb-private-enumerations.h" diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index ef82b02..c79b57f 100644 --- a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBSaveCoreOptions.h" +#include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/Host/FileSystem.h" #include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/Instrumentation.h" @@ -89,6 +90,16 @@ bool SBSaveCoreOptions::RemoveThread(lldb::SBThread thread) { return m_opaque_up->RemoveThread(thread.GetSP()); } +lldb::SBError +SBSaveCoreOptions::AddMemoryRegionToSave(const SBMemoryRegionInfo &region) { + LLDB_INSTRUMENT_VA(this, region); + // Currently, adding a memory region can't fail, so we always return a + // successful SBError, but because these APIs live forever, this is the most + // future-proof thing to do.
+ m_opaque_up->AddMemoryRegionToSave(region.ref()); + return SBError(); +} + void SBSaveCoreOptions::Clear() { LLDB_INSTRUMENT_VA(this); m_opaque_up->Clear(); diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 25eb633..5b0f4f6 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -25,6 +25,7 @@ #include "lldb/Interpreter/OptionArgParser.h" #include "lldb/Interpreter/OptionGroupPythonClassWithDict.h" #include "lldb/Interpreter/Options.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Platform.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index f97d86a..fd5b499 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -61,6 +61,12 @@ static void resolve_tilde(FileSpec &file_spec) { } } +static std::string toString(const Checksum &checksum) { + if (!checksum) + return ""; + return std::string(llvm::formatv("{0}", checksum.digest())); +} + // SourceManager constructor SourceManager::SourceManager(const TargetSP &target_sp) : m_last_support_file_sp(std::make_shared<SupportFile>()), m_last_line(0), @@ -302,6 +308,18 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( break; } } + + Checksum line_table_checksum = + last_file_sp->GetSupportFile()->GetChecksum(); + Checksum on_disk_checksum = last_file_sp->GetChecksum(); + if (line_table_checksum && line_table_checksum != on_disk_checksum) + Debugger::ReportWarning( + llvm::formatv( + "{0}: source file checksum mismatch between line table " + "({1}) and file on disk ({2})", + last_file_sp->GetSupportFile()->GetSpecOnly().GetFilename(), + toString(line_table_checksum), toString(on_disk_checksum)), + std::nullopt, &last_file_sp->GetChecksumWarningOnceFlag()); } return *delta; } @@ -837,12 +855,6 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile( return {}; } -static std::string toString(const Checksum &checksum) { - if (!checksum) - return ""; - return std::string(llvm::formatv("{0}", checksum.digest())); -} - void SourceManager::SourceFileCache::Dump(Stream &stream) const { // clang-format off stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n"; diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index b28beab..06da83e 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6562,13 +6562,15 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp, } if (make_core) { - Process::CoreFileMemoryRanges core_ranges; + CoreFileMemoryRanges core_ranges; error = process_sp->CalculateCoreFileSaveRanges(options, core_ranges); if (error.Success()) { const uint32_t addr_byte_size = target_arch.GetAddressByteSize(); const ByteOrder byte_order = target_arch.GetByteOrder(); std::vector<llvm::MachO::segment_command_64> segment_load_commands; - for (const auto &core_range : core_ranges) { + for (const auto &core_range_info : core_ranges) { + // TODO: Refactor RangeDataVector to have a data iterator. 
+ const auto &core_range = core_range_info.data; uint32_t cmd_type = LC_SEGMENT_64; uint32_t segment_size = sizeof(llvm::MachO::segment_command_64); if (addr_byte_size == 4) { diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h index 27bc237..be87112 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h @@ -12,6 +12,7 @@ #include "lldb/Core/Address.h" #include "lldb/Host/SafeMachO.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/FileSpecList.h" #include "lldb/Utility/RangeMap.h" diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 5c9ba22..edc568a 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -831,25 +831,32 @@ Status MinidumpFileBuilder::AddMemoryList() { // bytes of the core file. Thread structures in minidump files can only use // 32 bit memory descriptors, so we emit them first to ensure the memory is // accessible with a 32 bit offset. - Process::CoreFileMemoryRanges ranges_32; - Process::CoreFileMemoryRanges ranges_64; - Process::CoreFileMemoryRanges all_core_memory_ranges; + std::vector<CoreFileMemoryRange> ranges_32; + std::vector<CoreFileMemoryRange> ranges_64; + CoreFileMemoryRanges all_core_memory_ranges; error = m_process_sp->CalculateCoreFileSaveRanges(m_save_core_options, all_core_memory_ranges); + + std::vector<CoreFileMemoryRange> all_core_memory_vec; + // Extract all the data into just a vector of data, so we can mutate this in + // place. + for (const auto &core_range : all_core_memory_ranges) + all_core_memory_vec.push_back(core_range.data); + if (error.Fail()) return error; // Start by saving all of the stacks and ensuring they fit under the 32b // limit. uint64_t total_size = GetCurrentDataEndOffset(); - auto iterator = all_core_memory_ranges.begin(); - while (iterator != all_core_memory_ranges.end()) { + auto iterator = all_core_memory_vec.begin(); + while (iterator != all_core_memory_vec.end()) { if (m_saved_stack_ranges.count(iterator->range.start()) > 0) { // We don't save stacks twice. ranges_32.push_back(*iterator); total_size += iterator->range.size() + sizeof(llvm::minidump::MemoryDescriptor); - iterator = all_core_memory_ranges.erase(iterator); + iterator = all_core_memory_vec.erase(iterator); } else { iterator++; } @@ -869,11 +876,11 @@ Status MinidumpFileBuilder::AddMemoryList() { // Then anything that overflows extends into 64b addressable space. // All core memory ranges will either contain nothing, stacks only, // or all the memory ranges including stacks - if (!all_core_memory_ranges.empty()) - total_size += 256 + (all_core_memory_ranges.size() * + if (!all_core_memory_vec.empty()) + total_size += 256 + (all_core_memory_vec.size() * sizeof(llvm::minidump::MemoryDescriptor_64)); - for (const auto &core_range : all_core_memory_ranges) { + for (const auto &core_range : all_core_memory_vec) { const addr_t range_size = core_range.range.size(); // We don't need to check for stacks here because we already removed them // from all_core_memory_ranges.
@@ -958,15 +965,15 @@ Status MinidumpFileBuilder::DumpDirectories() const { } static uint64_t -GetLargestRangeSize(const Process::CoreFileMemoryRanges &ranges) { +GetLargestRangeSize(const std::vector<CoreFileMemoryRange> &ranges) { uint64_t max_size = 0; for (const auto &core_range : ranges) max_size = std::max(max_size, core_range.range.size()); return max_size; } -Status -MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) { +Status MinidumpFileBuilder::AddMemoryList_32( + std::vector<CoreFileMemoryRange> &ranges) { std::vector<MemoryDescriptor> descriptors; Status error; if (ranges.size() == 0) @@ -1042,8 +1049,8 @@ MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) { return error; } -Status -MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) { +Status MinidumpFileBuilder::AddMemoryList_64( + std::vector<CoreFileMemoryRange> &ranges) { Status error; if (ranges.empty()) return error; diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h index 762de83..71001e2 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h @@ -23,6 +23,7 @@ #include <utility> #include <variant> +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Process.h" #include "lldb/Target/Target.h" #include "lldb/Utility/DataBufferHeap.h" @@ -120,9 +121,9 @@ private: lldb_private::Status AddData(const void *data, uint64_t size); // Add MemoryList stream, containing dumps of important memory segments lldb_private::Status - AddMemoryList_64(lldb_private::Process::CoreFileMemoryRanges &ranges); + AddMemoryList_64(std::vector<lldb_private::CoreFileMemoryRange> &ranges); lldb_private::Status - AddMemoryList_32(lldb_private::Process::CoreFileMemoryRanges &ranges); + AddMemoryList_32(std::vector<lldb_private::CoreFileMemoryRange> &ranges); // Update the thread list on disk with the newly emitted stack RVAs. 
lldb_private::Status FixThreadStacks(); lldb_private::Status FlushBufferToDisk(); diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h index b76fcd0..2f45f01 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h @@ -21,6 +21,7 @@ #define LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_OBJECTFILEMINIDUMP_H #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/ArchSpec.h" class ObjectFileMinidump : public lldb_private::PluginInterface { diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 9d01089..8d9c919 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -17,6 +17,7 @@ #include "lldb/Interpreter/OptionValueDictionary.h" #include "lldb/Interpreter/OptionValueProperties.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Process.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/Target.h" diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h index 8bccf3b..4f4dedf 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h @@ -13,6 +13,7 @@ #include <vector> #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "llvm/Object/COFF.h" class ObjectFilePECOFF : public lldb_private::ObjectFile { diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp index 35943726..8d9aade 100644 --- a/lldb/source/Symbol/SaveCoreOptions.cpp +++ b/lldb/source/Symbol/SaveCoreOptions.cpp @@ -102,6 +102,19 @@ bool SaveCoreOptions::ShouldThreadBeSaved(lldb::tid_t tid) const { return m_threads_to_save.count(tid) > 0; } +bool SaveCoreOptions::HasSpecifiedThreads() const { + return !m_threads_to_save.empty(); +} + +void SaveCoreOptions::AddMemoryRegionToSave( + const lldb_private::MemoryRegionInfo &region) { + m_regions_to_save.Insert(region.GetRange(), /*combine=*/true); +} + +const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const { + return m_regions_to_save; +} + Status SaveCoreOptions::EnsureValidConfiguration( lldb::ProcessSP process_sp) const { Status error; @@ -131,4 +144,5 @@ void SaveCoreOptions::Clear() { m_style = std::nullopt; m_threads_to_save.clear(); m_process_sp.reset(); + m_regions_to_save.Clear(); } diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index a42c44b..a6d2eac 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -11,6 +11,7 @@ add_lldb_library(lldbTarget ABI.cpp AssertFrameRecognizer.cpp DynamicRegisterInfo.cpp + CoreFileMemoryRanges.cpp ExecutionContext.cpp InstrumentationRuntime.cpp InstrumentationRuntimeStopInfo.cpp diff --git a/lldb/source/Target/CoreFileMemoryRanges.cpp b/lldb/source/Target/CoreFileMemoryRanges.cpp new file mode 100644 index 0000000..6e4ca49 --- /dev/null +++ b/lldb/source/Target/CoreFileMemoryRanges.cpp @@ -0,0 +1,86 @@ +//===-- CoreFileMemoryRanges.cpp --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Target/CoreFileMemoryRanges.h" + +using namespace lldb; +using namespace lldb_private; + +using Entry = CoreFileMemoryRanges::Entry; + +static bool Overlaps(const Entry *region_one, const Entry *region_two) { + return !(region_one->GetRangeEnd() < region_two->GetRangeBase() || + region_two->GetRangeEnd() < region_one->GetRangeBase()); +} + +static bool IntersectHelper(const Entry *region_one, const Entry *region_two) { + return region_one->GetRangeBase() == region_two->GetRangeEnd() || + region_one->GetRangeEnd() == region_two->GetRangeBase(); +} + +static bool OnlyIntersects(const Entry *region_one, const Entry *region_two) { + return IntersectHelper(region_one, region_two) || + IntersectHelper(region_two, region_one); +} + +static bool PermissionsMatch(const Entry *region_one, const Entry *region_two) { + return region_one->data.lldb_permissions == region_two->data.lldb_permissions; +} + +// This assumes any overlapping ranges will share the same permissions +// and that adjacent ranges could have different permissions. +Status CoreFileMemoryRanges::FinalizeCoreFileSaveRanges() { + Status error; + this->Sort(); + for (size_t i = this->GetSize() - 1; i > 0; i--) { + auto region_one = this->GetMutableEntryAtIndex(i); + auto region_two = this->GetMutableEntryAtIndex(i - 1); + if (Overlaps(region_one, region_two)) { + // It's okay for intersecting regions to have different permissions, but + // if they overlap we fail because we don't know what to do with them. + if (!PermissionsMatch(region_one, region_two)) { + // Permissions mismatch and it's not a simple intersection. + if (!OnlyIntersects(region_one, region_two)) { + error = Status::FromErrorStringWithFormatv( + "Memory region at {0}::{1} has different permissions than " + "overlapping region at {2}::{3}", + region_one->GetRangeBase(), region_one->GetRangeEnd(), + region_two->GetRangeBase(), region_two->GetRangeEnd()); + return error; + } + // Simple intersection, we can just not merge these. + else + continue; + } + const addr_t base = + std::min(region_one->GetRangeBase(), region_two->GetRangeBase()); + const addr_t byte_size = + std::max(region_one->GetRangeEnd(), region_two->GetRangeEnd()) - base; + + region_two->SetRangeBase(base); + region_two->SetByteSize(byte_size); + + // Because this is a range data vector, the entry has a base as well + // as the data contained in the entry, so we have to update both. + // And llvm::AddressRange isn't mutable, so we have to create a new one. + llvm::AddressRange range(base, base + byte_size); + const CoreFileMemoryRange core_range = { + range, region_two->data.lldb_permissions}; + region_two->data = core_range; + // Erase deletes the half-open index range [inclusive, exclusive).
+ if (!this->Erase(i, i + 1)) { + error = Status::FromErrorStringWithFormat( + "Core file memory ranges mutated outside of " + "CalculateCoreFileSaveRanges"); + return error; + } + } + } + + return error; +} diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 40f3115..aca0897 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6463,7 +6463,7 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len, } // Create a CoreFileMemoryRange from a MemoryRegionInfo -static Process::CoreFileMemoryRange +static CoreFileMemoryRange CreateCoreFileMemoryRange(const MemoryRegionInfo &region) { const addr_t addr = region.GetRange().GetRangeBase(); llvm::AddressRange range(addr, addr + region.GetRange().GetByteSize()); @@ -6474,7 +6474,7 @@ CreateCoreFileMemoryRange(const MemoryRegionInfo &region) { // were added. Return false if the dirty page information is not valid or in // the region. static bool AddDirtyPages(const MemoryRegionInfo &region, - Process::CoreFileMemoryRanges &ranges) { + CoreFileMemoryRanges &ranges) { const auto &dirty_page_list = region.GetDirtyPageList(); if (!dirty_page_list) return false; @@ -6494,14 +6494,14 @@ static bool AddDirtyPages(const MemoryRegionInfo &region, } else { // Add previous contiguous range and init the new range with the // current dirty page. - ranges.push_back({range, lldb_permissions}); + ranges.Append(range.start(), range.size(), {range, lldb_permissions}); range = llvm::AddressRange(page_addr, page_addr + page_size); } } } // The last range if (!range.empty()) - ranges.push_back({range, lldb_permissions}); + ranges.Append(range.start(), range.size(), {range, lldb_permissions}); return true; } @@ -6513,7 +6513,7 @@ static bool AddDirtyPages(const MemoryRegionInfo &region, // will be added to \a ranges, else the entire range will be added to \a // ranges. static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages, - Process::CoreFileMemoryRanges &ranges) { + CoreFileMemoryRanges &ranges) { // Don't add empty ranges. if (region.GetRange().GetByteSize() == 0) return; @@ -6522,13 +6522,17 @@ static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages, return; if (try_dirty_pages && AddDirtyPages(region, ranges)) return; - ranges.push_back(CreateCoreFileMemoryRange(region)); + + ranges.Append(region.GetRange().GetRangeBase(), + region.GetRange().GetByteSize(), + CreateCoreFileMemoryRange(region)); } -static void SaveOffRegionsWithStackPointers( - Process &process, const SaveCoreOptions &core_options, - const MemoryRegionInfos &regions, Process::CoreFileMemoryRanges &ranges, - std::set<addr_t> &stack_ends) { +static void SaveOffRegionsWithStackPointers(Process &process, + const SaveCoreOptions &core_options, + const MemoryRegionInfos &regions, + CoreFileMemoryRanges &ranges, + std::set<addr_t> &stack_ends) { const bool try_dirty_pages = true; // Before we take any dump, we want to save off the used portions of the @@ -6568,11 +6572,11 @@ static void SaveOffRegionsWithStackPointers( // for a full core file style. static void GetCoreFileSaveRangesFull(Process &process, const MemoryRegionInfos &regions, - Process::CoreFileMemoryRanges &ranges, + CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) { // Don't add only dirty pages, add full regions.
-const bool try_dirty_pages = false; + const bool try_dirty_pages = false; for (const auto &region : regions) if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0) AddRegion(region, try_dirty_pages, ranges); @@ -6582,9 +6586,10 @@ const bool try_dirty_pages = false; // least some dirty pages, as some OS versions don't support reporting what // pages are dirty within a memory region. If no memory regions have dirty // page information, fall back to saving out all ranges with write permissions. -static void GetCoreFileSaveRangesDirtyOnly( - Process &process, const MemoryRegionInfos &regions, - Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) { +static void GetCoreFileSaveRangesDirtyOnly(Process &process, + const MemoryRegionInfos &regions, + CoreFileMemoryRanges &ranges, + std::set<addr_t> &stack_ends) { // Iterate over the regions and find all dirty pages. bool have_dirty_page_info = false; @@ -6613,9 +6618,10 @@ static void GetCoreFileSaveRangesDirtyOnly( // dirty regions as this will make the core file smaller. If the process // doesn't support dirty regions, then it will fall back to adding the full // stack region. -static void GetCoreFileSaveRangesStackOnly( - Process &process, const MemoryRegionInfos &regions, - Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) { +static void GetCoreFileSaveRangesStackOnly(Process &process, + const MemoryRegionInfos &regions, + CoreFileMemoryRanges &ranges, + std::set<addr_t> &stack_ends) { const bool try_dirty_pages = true; // Some platforms support annotating the region information that tells us that // it comes from a thread stack. So look for those regions first. @@ -6628,6 +6634,24 @@ static void GetCoreFileSaveRangesStackOnly( } } +static void GetUserSpecifiedCoreFileSaveRanges(Process &process, + const MemoryRegionInfos &regions, + const SaveCoreOptions &options, + CoreFileMemoryRanges &ranges) { + const auto &option_ranges = options.GetCoreFileMemoryRanges(); + if (option_ranges.IsEmpty()) + return; + + for (const auto &range : regions) { + auto entry = option_ranges.FindEntryThatContains(range.GetRange()); + if (entry) { + ranges.Append(range.GetRange().GetRangeBase(), + range.GetRange().GetByteSize(), + CreateCoreFileMemoryRange(range)); + } + } +} + Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, CoreFileMemoryRanges &ranges) { lldb_private::MemoryRegionInfos regions; @@ -6643,11 +6667,18 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, "callers must set the core_style to something other than " "eSaveCoreUnspecified"); + GetUserSpecifiedCoreFileSaveRanges(*this, regions, options, ranges); + std::set<addr_t> stack_ends; - SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends); + // For fully custom setups, we don't want to even look at threads if there + // are no threads specified. + if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads()) + SaveOffRegionsWithStackPointers(*this, options, regions, ranges, + stack_ends); switch (core_style) { case eSaveCoreUnspecified: + case eSaveCoreCustomOnly: break; case eSaveCoreFull: @@ -6666,10 +6697,11 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, if (err.Fail()) return err; - if (ranges.empty()) - return Status("no valid address ranges found for core style"); + if (ranges.IsEmpty()) + return Status::FromErrorStringWithFormat( + "no valid address ranges found for core style"); - return Status(); // Success!
+ return ranges.FinalizeCoreFileSaveRanges(); } std::vector<ThreadSP> diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index ed15793..2cbe20e 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -344,3 +344,152 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): self.assertTrue(self.dbg.DeleteTarget(target)) if os.path.isfile(default_value_file): os.unlink(default_value_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_linux_minidump_one_region(self): + """Test that we can save a Linux minidump with a single memory region specified in the SBSaveCoreOptions""" + + self.build() + exe = self.getBuildArtifact("a.out") + one_region_file = self.getBuildArtifact("core.one_region.dmp") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + memory_region = lldb.SBMemoryRegionInfo() + memory_list = process.GetMemoryRegions() + memory_list.GetMemoryRegionAtIndex(0, memory_region) + + # This is almost identical to the single thread test case because + # minidump defaults to stacks only, so we want to see if the + # default options work as expected. + options = lldb.SBSaveCoreOptions() + file_spec = lldb.SBFileSpec(one_region_file) + options.SetOutputFile(file_spec) + options.SetPluginName("minidump") + options.AddMemoryRegionToSave(memory_region) + options.SetStyle(lldb.eSaveCoreCustomOnly) + error = process.SaveCore(options) + print(f"Error: {error.GetCString()}") + self.assertTrue(error.Success(), error.GetCString()) + + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(one_region_file) + core_memory_list = core_proc.GetMemoryRegions() + # Note: because the /proc/pid maps are included on Linux, we can't + # depend on size for validation, so we'll ensure the first region + # is present and then assert we fail on the second.
+ core_memory_region = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(0, core_memory_region) + self.assertEqual( + core_memory_region.GetRegionBase(), memory_region.GetRegionBase() + ) + self.assertEqual( + core_memory_region.GetRegionEnd(), memory_region.GetRegionEnd() + ) + + region_two = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(1, region_two) + err = lldb.SBError() + content = core_proc.ReadMemory(region_two.GetRegionBase(), 1, err) + self.assertTrue(err.Fail(), "Should fail to read memory") + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(one_region_file): + os.unlink(one_region_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_minidump_custom_save_style(self): + """Test that verifies a custom and unspecified save style fails for + containing no data to save""" + + self.build() + exe = self.getBuildArtifact("a.out") + custom_file = self.getBuildArtifact("core.custom.dmp") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + + error = process.SaveCore(options) + self.assertTrue(error.Fail()) + self.assertEqual( + error.GetCString(), "no valid address ranges found for core style" + ) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(custom_file): + os.unlink(custom_file) + + def save_core_with_region(self, process, region_index): + try: + custom_file = self.getBuildArtifact("core.custom.dmp") + memory_region = lldb.SBMemoryRegionInfo() + memory_list = process.GetMemoryRegions() + memory_list.GetMemoryRegionAtIndex(region_index, memory_region) + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreFull) + # Request the region explicitly so it is specified twice: once by + # the full save style and once as a custom region. + options.AddMemoryRegionToSave(memory_region) + + error = process.SaveCore(options) + self.assertTrue(error.Success()) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(custom_file) + core_memory_list = core_proc.GetMemoryRegions() + # proc/pid/ maps are included on linux, so we can't depend on size + # for validation; we make a set of all the ranges + # and ensure no duplicates! + range_set = set() + for x in range(core_memory_list.GetSize()): + core_memory_region = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(x, core_memory_region) + mem_tuple = ( + core_memory_region.GetRegionBase(), + core_memory_region.GetRegionEnd(), + ) + self.assertTrue( + mem_tuple not in range_set, "Duplicate memory region found" + ) + range_set.add(mem_tuple) + finally: + if os.path.isfile(custom_file): + os.unlink(custom_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_minidump_custom_save_style_duplicated_regions(self): + """Test that a custom region duplicating a full-style region does not + produce duplicate ranges in the saved core""" + + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + memory_list = process.GetMemoryRegions() + # Test that we don't duplicate regions, by duplicating regions + # at various indices.
+            self.save_core_with_region(process, 0)
+            self.save_core_with_region(process, len(memory_list) - 1)
+
+        finally:
+            self.assertTrue(self.dbg.DeleteTarget(target))
diff --git a/lldb/test/Shell/SymbolFile/Inputs/main.c b/lldb/test/Shell/SymbolFile/Inputs/main.c
new file mode 100644
index 0000000..341417f
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/Inputs/main.c
@@ -0,0 +1,4 @@
+int main(int argc, char **argv) {
+  // Break on main.
+  return 1;
+}
diff --git a/lldb/test/Shell/SymbolFile/checksum-mismatch.test b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
new file mode 100644
index 0000000..5db9764
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
@@ -0,0 +1,7 @@
+RUN: mkdir -p %t
+RUN: cp %S/Inputs/main.c %t/main.c
+RUN: %clang_host %t/main.c -std=c99 -gdwarf-5 -o %t/main.out
+RUN: echo "// Modify source file hash" >> %t/main.c
+RUN: %lldb -b %t/main.out -o 'b main' -o 'r' 2>&1 | FileCheck %s
+
+CHECK: warning: main.c: source file checksum mismatch between line table ({{.*}}) and file on disk ({{.*}})
diff --git a/lldb/unittests/Process/Utility/CMakeLists.txt b/lldb/unittests/Process/Utility/CMakeLists.txt
index 651f871..ec0ff95 100644
--- a/lldb/unittests/Process/Utility/CMakeLists.txt
+++ b/lldb/unittests/Process/Utility/CMakeLists.txt
@@ -18,6 +18,7 @@ add_lldb_unittest(ProcessUtilityTests
  LinuxProcMapsTest.cpp
  MemoryTagManagerAArch64MTETest.cpp
  RegisterContextTest.cpp
+  CoreFileMemoryRangesTest.cpp
  ${PLATFORM_SOURCES}

  LINK_LIBS
diff --git a/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
new file mode 100644
index 0000000..6d514b1
--- /dev/null
+++ b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
@@ -0,0 +1,205 @@
+//===-- CoreFileMemoryRangesTest.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "lldb/Target/CoreFileMemoryRanges.h"
+#include "lldb/lldb-types.h"
+
+using namespace lldb_private;
+
+TEST(CoreFileMemoryRangesTest, MapOverlappingRanges) {
+  lldb_private::CoreFileMemoryRanges ranges;
+  const lldb::addr_t start_addr = 0x1000;
+  const lldb::addr_t increment_addr = 0x1000;
+  const size_t iterations = 10;
+  for (size_t i = 0; i < iterations; i++) {
+    const lldb::addr_t start = start_addr + (i * increment_addr);
+    const lldb::addr_t end = start + increment_addr;
+    // Arbitrary value
+    const uint32_t permissions = 0x3;
+    llvm::AddressRange range(start, end);
+    const CoreFileMemoryRange core_range = {range, permissions};
+    // The entry is (start, size), while the range is (start, end).
+    CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+    ranges.Append(entry);
+  }
+
+  Status error = ranges.FinalizeCoreFileSaveRanges();
+  EXPECT_TRUE(error.Success());
+  ASSERT_THAT(1, ranges.GetSize());
+  const auto range = ranges.GetEntryAtIndex(0);
+  ASSERT_TRUE(range);
+  ASSERT_THAT(start_addr, range->GetRangeBase());
+  ASSERT_THAT(start_addr + (iterations * increment_addr),
+              range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, RangesSplitByPermissions) {
+  lldb_private::CoreFileMemoryRanges ranges;
+  const lldb::addr_t start_addr = 0x1000;
+  const lldb::addr_t increment_addr = 0x1000;
+  const size_t iterations = 10;
+  for (size_t i = 0; i < iterations; i++) {
+    const lldb::addr_t start = start_addr + (i * increment_addr);
+    const lldb::addr_t end = start + increment_addr;
+    const uint32_t permissions = i;
+    llvm::AddressRange range(start, end);
+    const CoreFileMemoryRange core_range = {range, permissions};
+    // The entry is (start, size), while the range is (start, end).
+    CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+    ranges.Append(entry);
+  }
+
+  Status error = ranges.FinalizeCoreFileSaveRanges();
+  EXPECT_TRUE(error.Success());
+  ASSERT_THAT(10, ranges.GetSize());
+  const auto range = ranges.GetEntryAtIndex(0);
+  ASSERT_TRUE(range);
+  ASSERT_THAT(start_addr, range->GetRangeBase());
+  ASSERT_THAT(start_addr + increment_addr, range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, MapPartialOverlappingRanges) {
+  lldb_private::CoreFileMemoryRanges ranges;
+  const lldb::addr_t start_addr = 0x1000;
+  const lldb::addr_t increment_addr = 0x1000;
+  const size_t iterations = 10;
+  for (size_t i = 0; i < iterations; i++) {
+    const lldb::addr_t start = start_addr + (i * increment_addr);
+    const lldb::addr_t end = start + increment_addr;
+    // Arbitrary value
+    const uint32_t permissions = 0x3;
+    llvm::AddressRange range(start, end);
+    const CoreFileMemoryRange core_range = {range, permissions};
+    // The entry is (start, size), while the range is (start, end).
+    CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+    ranges.Append(entry);
+  }
+
+  const lldb::addr_t unique_start = 0x7fff0000;
+  const lldb::addr_t unique_end = unique_start + increment_addr;
+  llvm::AddressRange range(unique_start, unique_end);
+  const uint32_t permissions = 0x3;
+  const CoreFileMemoryRange core_range = {range, permissions};
+  // The entry is (start, size), while the range is (start, end).
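+  // Expected outcome, as asserted below: the ten contiguous same-permission
+  // entries covering [0x1000, 0xB000) merge into one range, while this
+  // disjoint entry at 0x7fff0000 stays separate, for two finalized entries.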
+ CoreFileMemoryRanges::Entry entry = {unique_start, unique_end - unique_start, + core_range}; + ranges.Append(entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(2, ranges.GetSize()); + const auto merged_range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(merged_range); + ASSERT_THAT(start_addr, merged_range->GetRangeBase()); + ASSERT_THAT(start_addr + (iterations * increment_addr), + merged_range->GetRangeEnd()); + const auto unique_range = ranges.GetEntryAtIndex(1); + ASSERT_TRUE(unique_range); + ASSERT_THAT(unique_start, unique_range->GetRangeBase()); + ASSERT_THAT(unique_end, unique_range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_SamePermissions) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const lldb::addr_t superior_region_end = start_addr + increment_addr * 10; + llvm::AddressRange range(start_addr, superior_region_end); + const CoreFileMemoryRange core_range = {range, 0x3}; + CoreFileMemoryRanges::Entry entry = { + start_addr, superior_region_end - start_addr, core_range}; + ranges.Append(entry); + const lldb::addr_t inferior_region_end = start_addr + increment_addr; + llvm::AddressRange inferior_range(start_addr, inferior_region_end); + const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x3}; + CoreFileMemoryRanges::Entry inferior_entry = { + start_addr, inferior_region_end - start_addr, inferior_core_range}; + ranges.Append(inferior_entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(1, ranges.GetSize()); + const auto searched_range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(searched_range); + ASSERT_THAT(start_addr, searched_range->GetRangeBase()); + ASSERT_THAT(superior_region_end, searched_range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_DifferentPermissions) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const lldb::addr_t superior_region_end = start_addr + increment_addr * 10; + llvm::AddressRange range(start_addr, superior_region_end); + const CoreFileMemoryRange core_range = {range, 0x3}; + CoreFileMemoryRanges::Entry entry = { + start_addr, superior_region_end - start_addr, core_range}; + ranges.Append(entry); + const lldb::addr_t inferior_region_end = start_addr + increment_addr; + llvm::AddressRange inferior_range(start_addr, inferior_region_end); + const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x4}; + CoreFileMemoryRanges::Entry inferior_entry = { + start_addr, inferior_region_end - start_addr, inferior_core_range}; + ranges.Append(inferior_entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Fail()); +} + +TEST(CoreFileMemoryRangesTest, NonIntersectingRangesSamePermissions) { + const int permissions = 0x7; + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t region_one_start = 0x1000; + const lldb::addr_t region_one_end = 0x2000; + llvm::AddressRange range_one(region_one_start, region_one_end); + const CoreFileMemoryRange core_range_one = {range_one, permissions}; + CoreFileMemoryRanges::Entry entry_one = { + region_one_start, region_one_end - region_one_start, core_range_one}; + ranges.Append(entry_one); + const lldb::addr_t region_two_start = 0xb000; + const lldb::addr_t region_two_end = 0xc000; + llvm::AddressRange 
range_two(region_two_start, region_two_end);
+  const CoreFileMemoryRange core_range_two = {range_two, permissions};
+  CoreFileMemoryRanges::Entry entry_two = {
+      region_two_start, region_two_end - region_two_start, core_range_two};
+  ranges.Append(entry_two);
+
+  Status error = ranges.FinalizeCoreFileSaveRanges();
+  EXPECT_TRUE(error.Success());
+  ASSERT_THAT(2UL, ranges.GetSize());
+  ASSERT_THAT(region_one_start, ranges.GetEntryAtIndex(0)->GetRangeBase());
+  ASSERT_THAT(region_two_start, ranges.GetEntryAtIndex(1)->GetRangeBase());
+}
+
+TEST(CoreFileMemoryRangesTest, PartialOverlapping) {
+  const int permissions = 0x3;
+  lldb_private::CoreFileMemoryRanges ranges;
+  const lldb::addr_t start_addr = 0x1000;
+  const lldb::addr_t end_addr = 0x2000;
+  llvm::AddressRange range_one(start_addr, end_addr);
+  const CoreFileMemoryRange core_range_one = {range_one, permissions};
+  CoreFileMemoryRanges::Entry entry_one = {start_addr, end_addr - start_addr,
+                                           core_range_one};
+  llvm::AddressRange range_two(start_addr / 2, end_addr / 2);
+  const CoreFileMemoryRange core_range_two = {range_two, permissions};
+  CoreFileMemoryRanges::Entry entry_two = {
+      start_addr / 2, end_addr / 2 - start_addr / 2, core_range_two};
+  ranges.Append(entry_one);
+  ranges.Append(entry_two);
+
+  Status error = ranges.FinalizeCoreFileSaveRanges();
+  EXPECT_TRUE(error.Success());
+  ASSERT_THAT(1, ranges.GetSize());
+  const auto searched_range = ranges.GetEntryAtIndex(0);
+  ASSERT_TRUE(searched_range);
+  ASSERT_THAT(start_addr / 2, searched_range->GetRangeBase());
+  ASSERT_THAT(end_addr, searched_range->GetRangeEnd());
+}
diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp
index c646ba6..ca2cab6 100644
--- a/llvm/benchmarks/SandboxIRBench.cpp
+++ b/llvm/benchmarks/SandboxIRBench.cpp
@@ -34,15 +34,19 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
}

enum class IR {
-  LLVM,
-  SBox,
+  LLVM,           ///< LLVM IR
+  SBoxNoTracking, ///< Sandbox IR with tracking disabled
+  SBoxTracking,   ///< Sandbox IR with tracking enabled
};
-// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from IR::LLVM/IR::SBox.
+// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from IR::LLVM/IR::SBox*.
template <IR IRTy> struct TypeSelect {};
template <> struct TypeSelect<IR::LLVM> {
  using BasicBlock = llvm::BasicBlock;
};
-template <> struct TypeSelect<IR::SBox> {
+template <> struct TypeSelect<IR::SBoxNoTracking> {
+  using BasicBlock = sandboxir::BasicBlock;
+};
+template <> struct TypeSelect<IR::SBoxTracking> {
  using BasicBlock = sandboxir::BasicBlock;
};

@@ -59,12 +63,22 @@ genIR(std::unique_ptr<llvm::Module> &LLVMM, LLVMContext &LLVMCtx,
  sandboxir::Function *F = Ctx.createFunction(LLVMF);
  sandboxir::BasicBlock *BB = &*F->begin();

+  // Start tracking if we are testing with tracking enabled.
+  if constexpr (IRTy == IR::SBoxTracking)
+    Ctx.save();
+
  if constexpr (IRTy == IR::LLVM)
    return LLVMBB;
  else
    return BB;
}

+template <IR IRTy> static void finalize(sandboxir::Context &Ctx) {
+  // Accept changes if we are tracking.
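+  // Note: genIR() opened the tracking session with Ctx.save() when
+  // IRTy == IR::SBoxTracking; the accept() below closes it, so the tracked
+  // mutations are bracketed and the tracking overhead can be measured.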
+ if constexpr (IRTy == IR::SBoxTracking) + Ctx.accept(); +} + static std::string generateBBWalkIR(unsigned Size) { std::stringstream SS; SS << "define void @foo(i32 %v1, i32 %v2) {\n"; @@ -132,15 +146,61 @@ template <IR IRTy> static void RAUW(benchmark::State &State) { Def1->replaceAllUsesWith(Def2); Def2->replaceAllUsesWith(Def1); } + finalize<IRTy>(Ctx); +} + +static std::string generateRUOWIR(unsigned NumOperands) { + std::stringstream SS; + auto GenOps = [&SS, NumOperands]() { + for (auto Cnt : seq<unsigned>(0, NumOperands)) { + SS << "i8 %arg" << Cnt; + bool IsLast = Cnt + 1 == NumOperands; + if (!IsLast) + SS << ", "; + } + }; + + SS << "define void @foo("; + GenOps(); + SS << ") {\n"; + + SS << " call void @foo("; + GenOps(); + SS << ")\n"; + SS << "ret void"; + SS << "}"; + return SS.str(); +} + +template <IR IRTy> static void RUOW(benchmark::State &State) { + LLVMContext LLVMCtx; + sandboxir::Context Ctx(LLVMCtx); + std::unique_ptr<llvm::Module> LLVMM; + unsigned NumOperands = State.range(0); + auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateRUOWIR, NumOperands); + + auto It = BB->begin(); + auto *F = BB->getParent(); + auto *Arg0 = F->getArg(0); + auto *Arg1 = F->getArg(1); + auto *Call = &*It++; + for (auto _ : State) + Call->replaceUsesOfWith(Arg0, Arg1); + finalize<IRTy>(Ctx); } BENCHMARK(GetType<IR::LLVM>); -BENCHMARK(GetType<IR::SBox>); +BENCHMARK(GetType<IR::SBoxNoTracking>); BENCHMARK(BBWalk<IR::LLVM>)->Args({1024}); -BENCHMARK(BBWalk<IR::SBox>)->Args({1024}); +BENCHMARK(BBWalk<IR::SBoxTracking>)->Args({1024}); BENCHMARK(RAUW<IR::LLVM>)->Args({512}); -BENCHMARK(RAUW<IR::SBox>)->Args({512}); +BENCHMARK(RAUW<IR::SBoxNoTracking>)->Args({512}); +BENCHMARK(RAUW<IR::SBoxTracking>)->Args({512}); + +BENCHMARK(RUOW<IR::LLVM>)->Args({4096}); +BENCHMARK(RUOW<IR::SBoxNoTracking>)->Args({4096}); +BENCHMARK(RUOW<IR::SBoxTracking>)->Args({4096}); BENCHMARK_MAIN(); diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 00290c9..083d5c9 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -109,7 +109,8 @@ public: void clear() { incrementEpoch(); - if (getNumEntries() == 0 && getNumTombstones() == 0) return; + if (getNumEntries() == 0 && getNumTombstones() == 0) + return; // If the capacity of the array is huge, and the # elements used is small, // shrink the array. @@ -119,7 +120,7 @@ public: } const KeyT EmptyKey = getEmptyKey(); - if (std::is_trivially_destructible<ValueT>::value) { + if constexpr (std::is_trivially_destructible_v<ValueT>) { // Use a simpler loop when values don't need destruction. for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) P->getFirst() = EmptyKey; @@ -172,15 +173,14 @@ public: /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key /// type used. - template<class LookupKeyT> - iterator find_as(const LookupKeyT &Val) { + template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) { if (BucketT *Bucket = doFind(Val)) return makeIterator( Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(), *this, true); return end(); } - template<class LookupKeyT> + template <class LookupKeyT> const_iterator find_as(const LookupKeyT &Val) const { if (const BucketT *Bucket = doFind(Val)) return makeConstIterator( @@ -223,7 +223,7 @@ public: // The value is constructed in-place if the key is not in the map, otherwise // it is not moved. template <typename... 
Ts> - std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&... Args) { + std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&...Args) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return std::make_pair(makeIterator(TheBucket, @@ -248,7 +248,7 @@ public: // The value is constructed in-place if the key is not in the map, otherwise // it is not moved. template <typename... Ts> - std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&... Args) { + std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&...Args) { BucketT *TheBucket; if (LookupBucketFor(Key, TheBucket)) return std::make_pair(makeIterator(TheBucket, @@ -297,8 +297,7 @@ public: } /// insert - Range insertion of pairs. - template<typename InputIt> - void insert(InputIt I, InputIt E) { + template <typename InputIt> void insert(InputIt I, InputIt E) { for (; I != E; ++I) insert(*I); } @@ -420,7 +419,7 @@ protected: setNumEntries(0); setNumTombstones(0); - assert((getNumBuckets() & (getNumBuckets()-1)) == 0 && + assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 && "# initial buckets must be a power of two!"); const KeyT EmptyKey = getEmptyKey(); for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B) @@ -491,7 +490,7 @@ protected: return KeyInfoT::getHashValue(Val); } - template<typename LookupKeyT> + template <typename LookupKeyT> static unsigned getHashValue(const LookupKeyT &Val) { return KeyInfoT::getHashValue(Val); } @@ -502,14 +501,11 @@ protected: return KeyInfoT::getEmptyKey(); } - static const KeyT getTombstoneKey() { - return KeyInfoT::getTombstoneKey(); - } + static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } private: - iterator makeIterator(BucketT *P, BucketT *E, - DebugEpochBase &Epoch, - bool NoAdvance=false) { + iterator makeIterator(BucketT *P, BucketT *E, DebugEpochBase &Epoch, + bool NoAdvance = false) { if (shouldReverseIterate<KeyT>()) { BucketT *B = P == getBucketsEnd() ? getBuckets() : P + 1; return iterator(B, E, Epoch, NoAdvance); @@ -519,7 +515,7 @@ private: const_iterator makeConstIterator(const BucketT *P, const BucketT *E, const DebugEpochBase &Epoch, - const bool NoAdvance=false) const { + const bool NoAdvance = false) const { if (shouldReverseIterate<KeyT>()) { const BucketT *B = P == getBucketsEnd() ? 
getBuckets() : P + 1; return const_iterator(B, E, Epoch, NoAdvance); @@ -535,13 +531,9 @@ private: static_cast<DerivedT *>(this)->setNumEntries(Num); } - void incrementNumEntries() { - setNumEntries(getNumEntries() + 1); - } + void incrementNumEntries() { setNumEntries(getNumEntries() + 1); } - void decrementNumEntries() { - setNumEntries(getNumEntries() - 1); - } + void decrementNumEntries() { setNumEntries(getNumEntries() - 1); } unsigned getNumTombstones() const { return static_cast<const DerivedT *>(this)->getNumTombstones(); @@ -551,45 +543,33 @@ private: static_cast<DerivedT *>(this)->setNumTombstones(Num); } - void incrementNumTombstones() { - setNumTombstones(getNumTombstones() + 1); - } + void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); } - void decrementNumTombstones() { - setNumTombstones(getNumTombstones() - 1); - } + void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); } const BucketT *getBuckets() const { return static_cast<const DerivedT *>(this)->getBuckets(); } - BucketT *getBuckets() { - return static_cast<DerivedT *>(this)->getBuckets(); - } + BucketT *getBuckets() { return static_cast<DerivedT *>(this)->getBuckets(); } unsigned getNumBuckets() const { return static_cast<const DerivedT *>(this)->getNumBuckets(); } - BucketT *getBucketsEnd() { - return getBuckets() + getNumBuckets(); - } + BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); } const BucketT *getBucketsEnd() const { return getBuckets() + getNumBuckets(); } - void grow(unsigned AtLeast) { - static_cast<DerivedT *>(this)->grow(AtLeast); - } + void grow(unsigned AtLeast) { static_cast<DerivedT *>(this)->grow(AtLeast); } - void shrink_and_clear() { - static_cast<DerivedT *>(this)->shrink_and_clear(); - } + void shrink_and_clear() { static_cast<DerivedT *>(this)->shrink_and_clear(); } template <typename KeyArg, typename... ValueArgs> BucketT *InsertIntoBucket(BucketT *TheBucket, KeyArg &&Key, - ValueArgs &&... Values) { + ValueArgs &&...Values) { TheBucket = InsertIntoBucketImpl(Key, Key, TheBucket); TheBucket->getFirst() = std::forward<KeyArg>(Key); @@ -627,8 +607,9 @@ private: this->grow(NumBuckets * 2); LookupBucketFor(Lookup, TheBucket); NumBuckets = getNumBuckets(); - } else if (LLVM_UNLIKELY(NumBuckets-(NewNumEntries+getNumTombstones()) <= - NumBuckets/8)) { + } else if (LLVM_UNLIKELY(NumBuckets - + (NewNumEntries + getNumTombstones()) <= + NumBuckets / 8)) { this->grow(NumBuckets); LookupBucketFor(Lookup, TheBucket); } @@ -696,7 +677,7 @@ private: !KeyInfoT::isEqual(Val, TombstoneKey) && "Empty/Tombstone value shouldn't be inserted into map!"); - unsigned BucketNo = getHashValue(Val) & (NumBuckets-1); + unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); unsigned ProbeAmt = 1; while (true) { BucketT *ThisBucket = BucketsPtr + BucketNo; @@ -719,12 +700,12 @@ private: // prefer to return it than something that would require more probing. if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) && !FoundTombstone) - FoundTombstone = ThisBucket; // Remember the first tombstone found. + FoundTombstone = ThisBucket; // Remember the first tombstone found. // Otherwise, it's a hash collision or a tombstone, continue quadratic // probing. BucketNo += ProbeAmt++; - BucketNo &= (NumBuckets-1); + BucketNo &= (NumBuckets - 1); } } @@ -733,9 +714,7 @@ public: /// This is just the raw memory used by DenseMap. /// If entries are pointers to objects, the size of the referenced objects /// are not included. 
- size_t getMemorySize() const { - return getNumBuckets() * sizeof(BucketT); - } + size_t getMemorySize() const { return getNumBuckets() * sizeof(BucketT); } }; /// Equality comparison for DenseMap. @@ -803,8 +782,7 @@ public: swap(other); } - template<typename InputIt> - DenseMap(const InputIt &I, const InputIt &E) { + template <typename InputIt> DenseMap(const InputIt &I, const InputIt &E) { init(std::distance(I, E)); this->insert(I, E); } @@ -819,7 +797,7 @@ public: deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); } - void swap(DenseMap& RHS) { + void swap(DenseMap &RHS) { this->incrementEpoch(); RHS.incrementEpoch(); std::swap(Buckets, RHS.Buckets); @@ -828,13 +806,13 @@ public: std::swap(NumBuckets, RHS.NumBuckets); } - DenseMap& operator=(const DenseMap& other) { + DenseMap &operator=(const DenseMap &other) { if (&other != this) copyFrom(other); return *this; } - DenseMap& operator=(DenseMap &&other) { + DenseMap &operator=(DenseMap &&other) { this->destroyAll(); deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); init(0); @@ -842,7 +820,7 @@ public: return *this; } - void copyFrom(const DenseMap& other) { + void copyFrom(const DenseMap &other) { this->destroyAll(); deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); if (allocateBuckets(other.NumBuckets)) { @@ -867,14 +845,15 @@ public: unsigned OldNumBuckets = NumBuckets; BucketT *OldBuckets = Buckets; - allocateBuckets(std::max<unsigned>(64, static_cast<unsigned>(NextPowerOf2(AtLeast-1)))); + allocateBuckets(std::max<unsigned>( + 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1)))); assert(Buckets); if (!OldBuckets) { this->BaseT::initEmpty(); return; } - this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets); + this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets); // Free the old table. 
deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets, @@ -901,29 +880,17 @@ public: } private: - unsigned getNumEntries() const { - return NumEntries; - } + unsigned getNumEntries() const { return NumEntries; } - void setNumEntries(unsigned Num) { - NumEntries = Num; - } + void setNumEntries(unsigned Num) { NumEntries = Num; } - unsigned getNumTombstones() const { - return NumTombstones; - } + unsigned getNumTombstones() const { return NumTombstones; } - void setNumTombstones(unsigned Num) { - NumTombstones = Num; - } + void setNumTombstones(unsigned Num) { NumTombstones = Num; } - BucketT *getBuckets() const { - return Buckets; - } + BucketT *getBuckets() const { return Buckets; } - unsigned getNumBuckets() const { - return NumBuckets; - } + unsigned getNumBuckets() const { return NumBuckets; } bool allocateBuckets(unsigned Num) { NumBuckets = Num; @@ -984,7 +951,7 @@ public: swap(other); } - template<typename InputIt> + template <typename InputIt> SmallDenseMap(const InputIt &I, const InputIt &E) { init(NextPowerOf2(std::distance(I, E))); this->insert(I, E); @@ -998,7 +965,7 @@ public: deallocateBuckets(); } - void swap(SmallDenseMap& RHS) { + void swap(SmallDenseMap &RHS) { unsigned TmpNumEntries = RHS.NumEntries; RHS.NumEntries = NumEntries; NumEntries = TmpNumEntries; @@ -1070,13 +1037,13 @@ public: new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep)); } - SmallDenseMap& operator=(const SmallDenseMap& other) { + SmallDenseMap &operator=(const SmallDenseMap &other) { if (&other != this) copyFrom(other); return *this; } - SmallDenseMap& operator=(SmallDenseMap &&other) { + SmallDenseMap &operator=(SmallDenseMap &&other) { this->destroyAll(); deallocateBuckets(); init(0); @@ -1084,7 +1051,7 @@ public: return *this; } - void copyFrom(const SmallDenseMap& other) { + void copyFrom(const SmallDenseMap &other) { this->destroyAll(); deallocateBuckets(); Small = true; @@ -1106,7 +1073,7 @@ public: void grow(unsigned AtLeast) { if (AtLeast > InlineBuckets) - AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1)); + AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1)); if (Small) { // First move the inline buckets into a temporary storage. @@ -1150,7 +1117,8 @@ public: new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); } - this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets); + this->moveFromOldBuckets(OldRep.Buckets, + OldRep.Buckets + OldRep.NumBuckets); // Free the old table. deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, @@ -1179,9 +1147,7 @@ public: } private: - unsigned getNumEntries() const { - return NumEntries; - } + unsigned getNumEntries() const { return NumEntries; } void setNumEntries(unsigned Num) { // NumEntries is hardcoded to be 31 bits wide. 
@@ -1189,13 +1155,9 @@ private: NumEntries = Num; } - unsigned getNumTombstones() const { - return NumTombstones; - } + unsigned getNumTombstones() const { return NumTombstones; } - void setNumTombstones(unsigned Num) { - NumTombstones = Num; - } + void setNumTombstones(unsigned Num) { NumTombstones = Num; } const BucketT *getInlineBuckets() const { assert(Small); @@ -1207,7 +1169,7 @@ private: BucketT *getInlineBuckets() { return const_cast<BucketT *>( - const_cast<const SmallDenseMap *>(this)->getInlineBuckets()); + const_cast<const SmallDenseMap *>(this)->getInlineBuckets()); } const LargeRep *getLargeRep() const { @@ -1218,7 +1180,7 @@ private: LargeRep *getLargeRep() { return const_cast<LargeRep *>( - const_cast<const SmallDenseMap *>(this)->getLargeRep()); + const_cast<const SmallDenseMap *>(this)->getLargeRep()); } const BucketT *getBuckets() const { @@ -1227,7 +1189,7 @@ private: BucketT *getBuckets() { return const_cast<BucketT *>( - const_cast<const SmallDenseMap *>(this)->getBuckets()); + const_cast<const SmallDenseMap *>(this)->getBuckets()); } unsigned getNumBuckets() const { @@ -1278,7 +1240,8 @@ public: : DebugEpochBase::HandleBase(&Epoch), Ptr(Pos), End(E) { assert(isHandleInSync() && "invalid construction!"); - if (NoAdvance) return; + if (NoAdvance) + return; if (shouldReverseIterate<KeyT>()) { RetreatPastEmptyBuckets(); return; @@ -1324,7 +1287,7 @@ public: return !(LHS == RHS); } - inline DenseMapIterator& operator++() { // Preincrement + inline DenseMapIterator &operator++() { // Preincrement assert(isHandleInSync() && "invalid iterator access!"); assert(Ptr != End && "incrementing end() iterator"); if (shouldReverseIterate<KeyT>()) { @@ -1336,9 +1299,11 @@ public: AdvancePastEmptyBuckets(); return *this; } - DenseMapIterator operator++(int) { // Postincrement + DenseMapIterator operator++(int) { // Postincrement assert(isHandleInSync() && "invalid iterator access!"); - DenseMapIterator tmp = *this; ++*this; return tmp; + DenseMapIterator tmp = *this; + ++*this; + return tmp; } private: diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h index b89c886..a307bd8 100644 --- a/llvm/include/llvm/ADT/DenseSet.h +++ b/llvm/include/llvm/ADT/DenseSet.h @@ -89,18 +89,12 @@ public: /// before resizing again. void reserve(size_t Size) { TheMap.reserve(Size); } - void clear() { - TheMap.clear(); - } + void clear() { TheMap.clear(); } /// Return 1 if the specified key is in the set, 0 otherwise. 
- size_type count(const_arg_type_t<ValueT> V) const { - return TheMap.count(V); - } + size_type count(const_arg_type_t<ValueT> V) const { return TheMap.count(V); } - bool erase(const ValueT &V) { - return TheMap.erase(V); - } + bool erase(const ValueT &V) { return TheMap.erase(V); } void swap(DenseSetImpl &RHS) { TheMap.swap(RHS.TheMap); } @@ -128,8 +122,15 @@ public: ValueT *operator->() { return &I->getFirst(); } const ValueT *operator->() const { return &I->getFirst(); } - Iterator& operator++() { ++I; return *this; } - Iterator operator++(int) { auto T = *this; ++I; return T; } + Iterator &operator++() { + ++I; + return *this; + } + Iterator operator++(int) { + auto T = *this; + ++I; + return T; + } friend bool operator==(const Iterator &X, const Iterator &Y) { return X.I == Y.I; } @@ -157,8 +158,15 @@ public: const ValueT &operator*() const { return I->getFirst(); } const ValueT *operator->() const { return &I->getFirst(); } - ConstIterator& operator++() { ++I; return *this; } - ConstIterator operator++(int) { auto T = *this; ++I; return T; } + ConstIterator &operator++() { + ++I; + return *this; + } + ConstIterator operator++(int) { + auto T = *this; + ++I; + return T; + } friend bool operator==(const ConstIterator &X, const ConstIterator &Y) { return X.I == Y.I; } @@ -191,8 +199,7 @@ public: /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type /// used. - template <class LookupKeyT> - iterator find_as(const LookupKeyT &Val) { + template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) { return Iterator(TheMap.find_as(Val)); } template <class LookupKeyT> @@ -226,8 +233,7 @@ public: } // Range insertion of values. - template<typename InputIt> - void insert(InputIt I, InputIt E) { + template <typename InputIt> void insert(InputIt I, InputIt E) { for (; I != E; ++I) insert(*I); } @@ -266,8 +272,9 @@ bool operator!=(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS, /// Implements a dense probed hash-table based set. 
template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>> class DenseSet : public detail::DenseSetImpl< - ValueT, DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT, - detail::DenseSetPair<ValueT>>, + ValueT, + DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT, + detail::DenseSetPair<ValueT>>, ValueInfoT> { using BaseT = detail::DenseSetImpl<ValueT, @@ -285,12 +292,14 @@ template <typename ValueT, unsigned InlineBuckets = 4, typename ValueInfoT = DenseMapInfo<ValueT>> class SmallDenseSet : public detail::DenseSetImpl< - ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, - ValueInfoT, detail::DenseSetPair<ValueT>>, + ValueT, + SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, + ValueInfoT, detail::DenseSetPair<ValueT>>, ValueInfoT> { using BaseT = detail::DenseSetImpl< - ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, - ValueInfoT, detail::DenseSetPair<ValueT>>, + ValueT, + SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, ValueInfoT, + detail::DenseSetPair<ValueT>>, ValueInfoT>; public: diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index c5eff15..09fc34a 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -218,8 +218,8 @@ protected: inline unsigned getID() const; MemoryAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue, - BasicBlock *BB, unsigned NumOperands) - : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue), + BasicBlock *BB, AllocInfo AllocInfo) + : DerivedUser(Type::getVoidTy(C), Vty, AllocInfo, DeleteValue), Block(BB) {} // Use deleteValue() to delete a generic MemoryAccess. @@ -280,8 +280,8 @@ protected: MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty, DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB, - unsigned NumOperands) - : MemoryAccess(C, Vty, DeleteValue, BB, NumOperands), + AllocInfo AllocInfo) + : MemoryAccess(C, Vty, DeleteValue, BB, AllocInfo), MemoryInstruction(MI) { setDefiningAccess(DMA); } @@ -307,15 +307,16 @@ private: /// MemoryUse's is exactly the set of Instructions for which /// AliasAnalysis::getModRefInfo returns "Ref". class MemoryUse final : public MemoryUseOrDef { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + public: DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); MemoryUse(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB) - : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, - /*NumOperands=*/1) {} + : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, AllocMarker) {} // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { @@ -367,6 +368,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUse, MemoryAccess) /// associated with them. This use points to the nearest reaching /// MemoryDef/MemoryPhi. 
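+/// (Allocation note for this patch, reflected in the classes below: each
+/// subclass now declares a constexpr AllocMarker carrying its fixed operand
+/// count and passes it to both operator new and the base-class constructor,
+/// replacing the old raw NumOperands integers.)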
class MemoryDef final : public MemoryUseOrDef { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: friend class MemorySSA; @@ -374,12 +377,11 @@ public: MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB, unsigned Ver) - : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, - /*NumOperands=*/2), + : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, AllocMarker), ID(Ver) {} // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { @@ -474,8 +476,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess) /// Because MemoryUse's do not generate new definitions, they do not have this /// issue. class MemoryPhi final : public MemoryAccess { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } public: void operator delete(void *Ptr) { User::operator delete(Ptr); } @@ -484,7 +488,7 @@ public: DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); MemoryPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0) - : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, 0), ID(Ver), + : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, AllocMarker), ID(Ver), ReservedSpace(NumPreds) { allocHungoffUses(ReservedSpace); } diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h index a82e37b..0aefb5e 100644 --- a/llvm/include/llvm/IR/Constant.h +++ b/llvm/include/llvm/IR/Constant.h @@ -41,8 +41,8 @@ class APInt; /// LLVM Constant Representation class Constant : public User { protected: - Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps) - : User(ty, vty, Ops, NumOps) {} + Constant(Type *ty, ValueTy vty, AllocInfo AllocInfo) + : User(ty, vty, AllocInfo) {} ~Constant() = default; diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 62ccde9..3b16aa0 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -51,6 +51,8 @@ template <class ConstantClass> struct ConstantAggrKeyType; /// Since they can be in use by unrelated modules (and are never based on /// GlobalValues), it never makes sense to RAUW them. class ConstantData : public Constant { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + friend class Constant; Value *handleOperandChangeImpl(Value *From, Value *To) { @@ -58,9 +60,9 @@ class ConstantData : public Constant { } protected: - explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {} + explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, AllocMarker) {} - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } public: void operator delete(void *Ptr) { User::operator delete(Ptr); } @@ -399,7 +401,8 @@ public: /// use operands. class ConstantAggregate : public Constant { protected: - ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V); + ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V, + AllocInfo AllocInfo); public: /// Transparently provide more efficient getOperand methods. 
@@ -425,7 +428,7 @@ class ConstantArray final : public ConstantAggregate { friend struct ConstantAggrKeyType<ConstantArray>; friend class Constant; - ConstantArray(ArrayType *T, ArrayRef<Constant *> Val); + ConstantArray(ArrayType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -457,7 +460,7 @@ class ConstantStruct final : public ConstantAggregate { friend struct ConstantAggrKeyType<ConstantStruct>; friend class Constant; - ConstantStruct(StructType *T, ArrayRef<Constant *> Val); + ConstantStruct(StructType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -509,7 +512,7 @@ class ConstantVector final : public ConstantAggregate { friend struct ConstantAggrKeyType<ConstantVector>; friend class Constant; - ConstantVector(VectorType *T, ArrayRef<Constant *> Val); + ConstantVector(VectorType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -890,9 +893,11 @@ public: class BlockAddress final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + BlockAddress(Function *F, BasicBlock *BB); - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -936,9 +941,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BlockAddress, Value) class DSOLocalEquivalent final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + DSOLocalEquivalent(GlobalValue *GV); - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -973,9 +980,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DSOLocalEquivalent, Value) class NoCFIValue final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + NoCFIValue(GlobalValue *GV); - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -1013,10 +1022,12 @@ class ConstantPtrAuth final : public Constant { friend struct ConstantPtrAuthKeyType; friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{4}; + ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, Constant *AddrDisc); - void *operator new(size_t s) { return User::operator new(s, 4); } + void *operator new(size_t s) { return User::operator new(s, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -1102,8 +1113,8 @@ class ConstantExpr : public Constant { Value *handleOperandChangeImpl(Value *From, Value *To); protected: - ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps) - : Constant(ty, ConstantExprVal, Ops, NumOps) { + ConstantExpr(Type *ty, unsigned Opcode, AllocInfo AllocInfo) + : Constant(ty, ConstantExprVal, AllocInfo) { // Operation type (an Instruction opcode) is stored as the SubclassData. 
setValueSubclassData(Opcode); } diff --git a/llvm/include/llvm/IR/DerivedUser.h b/llvm/include/llvm/IR/DerivedUser.h index a25d316..a307315 100644 --- a/llvm/include/llvm/IR/DerivedUser.h +++ b/llvm/include/llvm/IR/DerivedUser.h @@ -34,9 +34,9 @@ private: DeleteValueTy DeleteValue; public: - DerivedUser(Type *Ty, unsigned VK, Use *U, unsigned NumOps, + DerivedUser(Type *Ty, unsigned VK, AllocInfo AllocInfo, DeleteValueTy DeleteValue) - : User(Ty, VK, U, NumOps), DeleteValue(DeleteValue) {} + : User(Ty, VK, AllocInfo), DeleteValue(DeleteValue) {} }; } // end namespace llvm diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index f7e4e97..866c68d 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -72,6 +72,8 @@ public: using const_arg_iterator = const Argument *; private: + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + // Important things that make up a function! BasicBlockListType BasicBlocks; ///< The basic blocks @@ -171,13 +173,14 @@ public: static Function *Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N = "", Module *M = nullptr) { - return new Function(Ty, Linkage, AddrSpace, N, M); + return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M); } // TODO: remove this once all users have been updated to pass an AddrSpace static Function *Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N = "", Module *M = nullptr) { - return new Function(Ty, Linkage, static_cast<unsigned>(-1), N, M); + return new (AllocMarker) + Function(Ty, Linkage, static_cast<unsigned>(-1), N, M); } /// Creates a new function and attaches it to a module. diff --git a/llvm/include/llvm/IR/GlobalAlias.h b/llvm/include/llvm/IR/GlobalAlias.h index 583d66e..3db6984 100644 --- a/llvm/include/llvm/IR/GlobalAlias.h +++ b/llvm/include/llvm/IR/GlobalAlias.h @@ -28,6 +28,8 @@ template <typename ValueSubClass, typename... Args> class SymbolTableListTraits; class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> { friend class SymbolTableListTraits<GlobalAlias>; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent); @@ -59,7 +61,7 @@ public: static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Provide fast operand accessors diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h index 8935284..0d2f152 100644 --- a/llvm/include/llvm/IR/GlobalIFunc.h +++ b/llvm/include/llvm/IR/GlobalIFunc.h @@ -34,6 +34,8 @@ template <typename ValueSubClass, typename... 
Args> class SymbolTableListTraits;

class GlobalIFunc final : public GlobalObject, public ilist_node<GlobalIFunc> {
  friend class SymbolTableListTraits<GlobalIFunc>;

+  constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
  GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
              const Twine &Name, Constant *Resolver, Module *Parent);

@@ -48,7 +50,7 @@ public:
                             Constant *Resolver, Module *Parent);

  // allocate space for exactly one operand
-  void *operator new(size_t S) { return User::operator new(S, 1); }
+  void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
  void operator delete(void *Ptr) { User::operator delete(Ptr); }

  /// Provide fast operand accessors
diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h
index b6a974d..08edc13 100644
--- a/llvm/include/llvm/IR/GlobalObject.h
+++ b/llvm/include/llvm/IR/GlobalObject.h
@@ -40,10 +40,9 @@ public:
  };

protected:
-  GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
-               LinkageTypes Linkage, const Twine &Name,
-               unsigned AddressSpace = 0)
-      : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace) {
+  GlobalObject(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+               const Twine &Name, unsigned AddressSpace = 0)
+      : GlobalValue(Ty, VTy, AllocInfo, Linkage, Name, AddressSpace) {
    setGlobalValueSubClassData(0);
  }
  ~GlobalObject();
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 53eddeb..d9104d7 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -77,9 +77,9 @@ public:
  };

protected:
-  GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
-              LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace)
-      : Constant(PointerType::get(Ty, AddressSpace), VTy, Ops, NumOps),
+  GlobalValue(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+              const Twine &Name, unsigned AddressSpace)
+      : Constant(PointerType::get(Ty, AddressSpace), VTy, AllocInfo),
        ValueType(Ty), Visibility(DefaultVisibility),
        UnnamedAddrVal(unsigned(UnnamedAddr::None)),
        DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal),
diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h
index 0736c300..83e4848 100644
--- a/llvm/include/llvm/IR/GlobalVariable.h
+++ b/llvm/include/llvm/IR/GlobalVariable.h
@@ -39,6 +39,8 @@ class DIGlobalVariableExpression;
class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
  friend class SymbolTableListTraits<GlobalVariable>;

+  constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
  AttributeSet Attrs;

  // Is this a global constant?
@@ -70,24 +72,31 @@ public:
  GlobalVariable(const GlobalVariable &) = delete;
  GlobalVariable &operator=(const GlobalVariable &) = delete;

+private:
+  /// Set the number of operands on a GlobalVariable.
+  ///
+  /// GlobalVariable always allocates space for a single operand, but
+  /// doesn't always use it.
+  void setGlobalVariableNumOperands(unsigned NumOps) {
+    assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands");
+    NumUserOperands = NumOps;
+  }
+
+public:
  ~GlobalVariable() {
    dropAllReferences();
+
+    // Number of operands can be set to 0 after construction and initialization.
+ // Make sure that number of operands is reset to 1, as this is needed in + // User::operator delete + setGlobalVariableNumOperands(1); } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t s) { return User::operator new(s, AllocMarker); } // delete space for exactly one operand as created in the corresponding new operator - void operator delete(void *ptr){ - assert(ptr != nullptr && "must not be nullptr"); - User *Obj = static_cast<User *>(ptr); - // Number of operands can be set to 0 after construction and initialization. Make sure - // that number of operands is reset to 1, as this is needed in User::operator delete - Obj->setGlobalVariableNumOperands(1); - User::operator delete(Obj); - } + void operator delete(void *ptr) { User::operator delete(ptr); } /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 5ed3ec4..4720533 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -55,24 +55,26 @@ typedef unsigned ID; //===----------------------------------------------------------------------===// class UnaryInstruction : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + protected: UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock::iterator IB) - : Instruction(Ty, iType, &Op<0>(), 1, IB) { + : Instruction(Ty, iType, AllocMarker, IB) { Op<0>() = V; } UnaryInstruction(Type *Ty, unsigned iType, Value *V, Instruction *IB = nullptr) - : Instruction(Ty, iType, &Op<0>(), 1, IB) { + : Instruction(Ty, iType, AllocMarker, IB) { Op<0>() = V; } UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock *IAE) - : Instruction(Ty, iType, &Op<0>(), 1, IAE) { + : Instruction(Ty, iType, AllocMarker, IAE) { Op<0>() = V; } public: // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -186,6 +188,8 @@ public: //===----------------------------------------------------------------------===// class BinaryOperator : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + void AssertOK(); protected: @@ -199,7 +203,7 @@ protected: public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -745,6 +749,8 @@ public: /// This class is the base class for the comparison instructions. /// Abstract base class of comparison instructions. class CmpInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: /// This enumeration lists the possible predicates for CmpInst subclasses. 
/// Values in the range 0-31 are reserved for FCmpInst, while values in the @@ -814,7 +820,7 @@ protected: public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Construct a compare instruction, given the opcode, the predicate and @@ -2416,10 +2422,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value) //===----------------------------------------------------------------------===// class FuncletPadInst : public Instruction { private: - FuncletPadInst(const FuncletPadInst &CPI); + FuncletPadInst(const FuncletPadInst &CPI, AllocInfo AllocInfo); explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, - ArrayRef<Value *> Args, unsigned Values, + ArrayRef<Value *> Args, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(Value *ParentPad, ArrayRef<Value *> Args, const Twine &NameStr); diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index c275723..a12d5d9 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -1030,7 +1030,7 @@ protected: setValueSubclassData(Storage); } - Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps, + Instruction(Type *Ty, unsigned iType, AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr); private: diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index ab3321e..e89739a 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -297,6 +297,8 @@ class StoreInst : public Instruction { void AssertOK(); + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -314,7 +316,7 @@ public: InsertPosition InsertBefore = nullptr); // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Return true if this is a store to a volatile memory location. @@ -420,6 +422,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value) class FenceInst : public Instruction { using OrderingField = AtomicOrderingBitfieldElementT<0>; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + void Init(AtomicOrdering Ordering, SyncScope::ID SSID); protected: @@ -436,7 +440,7 @@ public: InsertPosition InsertBefore = nullptr); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Returns the ordering constraint of this fence instruction. @@ -502,6 +506,8 @@ class AtomicCmpXchgInst : public Instruction { typename Bitfield::Element<AtomicOrdering, Offset, 3, AtomicOrdering::LAST>; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. 
friend class Instruction; @@ -515,7 +521,7 @@ public: InsertPosition InsertBefore = nullptr); // allocate space for exactly three operands - void *operator new(size_t S) { return User::operator new(S, 3); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; @@ -774,13 +780,15 @@ private: using BinOpBitfieldElement = typename Bitfield::Element<BinOp, Offset, 5, BinOp::LAST_BINOP>; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID, InsertPosition InsertBefore = nullptr); // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; @@ -924,14 +932,14 @@ class GetElementPtrInst : public Instruction { Type *SourceElementType; Type *ResultElementType; - GetElementPtrInst(const GetElementPtrInst &GEPI); + GetElementPtrInst(const GetElementPtrInst &GEPI, AllocInfo AllocInfo); /// Constructors - Create a getelementptr instruction with a base pointer an /// list of indices. The first and second ctor can optionally insert before an /// existing instruction, the third appends the new instruction to the /// specified BasicBlock. inline GetElementPtrInst(Type *PointeeType, Value *Ptr, - ArrayRef<Value *> IdxList, unsigned Values, + ArrayRef<Value *> IdxList, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr); @@ -949,8 +957,9 @@ public: InsertPosition InsertBefore = nullptr) { unsigned Values = 1 + unsigned(IdxList.size()); assert(PointeeType && "Must specify element type"); - return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values, - NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{Values}; + return new (AllocMarker) GetElementPtrInst( + PointeeType, Ptr, IdxList, AllocMarker, NameStr, InsertBefore); } static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr, @@ -1124,12 +1133,11 @@ struct OperandTraits<GetElementPtrInst> : public VariadicOperandTraits<GetElementPtrInst> {}; GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, - ArrayRef<Value *> IdxList, unsigned Values, - const Twine &NameStr, + ArrayRef<Value *> IdxList, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, - OperandTraits<GetElementPtrInst>::op_end(this) - Values, - Values, InsertBefore), + : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, AllocInfo, + InsertBefore), SourceElementType(PointeeType), ResultElementType(getIndexedType(PointeeType, IdxList)) { init(Ptr, IdxList, NameStr); @@ -1403,26 +1411,29 @@ public: /// hold the calling convention of the call. 
/// class CallInst : public CallBase { - CallInst(const CallInst &CI); + CallInst(const CallInst &CI, AllocInfo AllocInfo); /// Construct a CallInst from a range of arguments inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr, - InsertPosition InsertBefore); + AllocInfo AllocInfo, InsertPosition InsertBefore); inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, - const Twine &NameStr, InsertPosition InsertBefore) - : CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore) {} + const Twine &NameStr, AllocInfo AllocInfo, + InsertPosition InsertBefore) + : CallInst(Ty, Func, Args, std::nullopt, NameStr, AllocInfo, + InsertBefore) {} explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr, - InsertPosition InsertBefore); + AllocInfo AllocInfo, InsertPosition InsertBefore); void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr); void init(FunctionType *FTy, Value *Func, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(unsigned NumArgs, + unsigned NumBundleInputs = 0) { // We need one operand for the called function, plus the input operand // counts provided. return 1 + NumArgs + NumBundleInputs; @@ -1437,26 +1448,29 @@ protected: public: static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(0)}; + return new (AllocMarker) + CallInst(Ty, F, NameStr, AllocMarker, InsertBefore); } static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - return new (ComputeNumOperands(Args.size())) - CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(Args.size())}; + return new (AllocMarker) CallInst(Ty, Func, Args, std::nullopt, NameStr, + AllocMarker, InsertBefore); } static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - const int NumOperands = - ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); - const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(unsigned(Args.size()), CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - return new (NumOperands, DescriptorBytes) - CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore); + return new (AllocMarker) + CallInst(Ty, Func, Args, Bundles, NameStr, AllocMarker, InsertBefore); } static CallInst *Create(FunctionCallee Func, const Twine &NameStr = "", @@ -1561,12 +1575,11 @@ private: CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr, - InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Call, - OperandTraits<CallBase>::op_end(this) - - (Args.size() + CountBundleInputs(Bundles) + 1), - unsigned(Args.size() + CountBundleInputs(Bundles) + 1), + AllocInfo AllocInfo, InsertPosition InsertBefore) + : 
CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo, InsertBefore) { + assert(AllocInfo.NumOps == + unsigned(Args.size() + CountBundleInputs(Bundles) + 1)); init(Ty, Func, Args, Bundles, NameStr); } @@ -1577,10 +1590,11 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args, /// This class represents the LLVM 'select' instruction. /// class SelectInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(S1->getType(), Instruction::Select, &Op<0>(), 3, + : Instruction(S1->getType(), Instruction::Select, AllocMarker, InsertBefore) { init(C, S1, S2); setName(NameStr); @@ -1604,7 +1618,8 @@ public: const Twine &NameStr = "", InsertPosition InsertBefore = nullptr, Instruction *MDFrom = nullptr) { - SelectInst *Sel = new(3) SelectInst(C, S1, S2, NameStr, InsertBefore); + SelectInst *Sel = + new (AllocMarker) SelectInst(C, S1, S2, NameStr, InsertBefore); if (MDFrom) Sel->copyMetadata(*MDFrom); return Sel; @@ -1693,6 +1708,8 @@ public: /// element from a VectorType value /// class ExtractElementInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); @@ -1706,7 +1723,8 @@ public: static ExtractElementInst *Create(Value *Vec, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore); + return new (AllocMarker) + ExtractElementInst(Vec, Idx, NameStr, InsertBefore); } /// Return true if an extractelement instruction can be @@ -1749,6 +1767,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value) /// element into a VectorType value /// class InsertElementInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + InsertElementInst(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); @@ -1763,7 +1783,8 @@ public: static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore); + return new (AllocMarker) + InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore); } /// Return true if an insertelement instruction can be @@ -1813,6 +1834,8 @@ constexpr int PoisonMaskElem = -1; /// For scalable vectors, all the elements of the mask must be 0 or -1. This /// requirement may be relaxed in the future. class ShuffleVectorInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + SmallVector<int, 4> ShuffleMask; Constant *ShuffleMaskForBitcode; @@ -1834,7 +1857,7 @@ public: const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Swap the operands and adjust the mask to preserve the semantics @@ -2395,6 +2418,8 @@ ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, /// value into an aggregate value. 
/// class InsertValueInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + SmallVector<unsigned, 4> Indices; InsertValueInst(const InsertValueInst &IVI); @@ -2423,7 +2448,7 @@ protected: public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static InsertValueInst *Create(Value *Agg, Value *Val, @@ -2493,9 +2518,7 @@ struct OperandTraits<InsertValueInst> : InsertValueInst::InsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(Agg->getType(), InsertValue, - OperandTraits<InsertValueInst>::op_begin(this), 2, - InsertBefore) { + : Instruction(Agg->getType(), InsertValue, AllocMarker, InsertBefore) { init(Agg, Val, Idxs, NameStr); } @@ -2510,6 +2533,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value) // scientist's overactive imagination. // class PHINode : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. unsigned ReservedSpace; @@ -2519,7 +2544,7 @@ class PHINode : public Instruction { explicit PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) - : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore), + : Instruction(Ty, Instruction::PHI, AllocMarker, InsertBefore), ReservedSpace(NumReservedValues) { assert(!Ty->isTokenTy() && "PHI nodes cannot have token type!"); setName(NameStr); @@ -2545,7 +2570,8 @@ public: static PHINode *Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore); + return new (AllocMarker) + PHINode(Ty, NumReservedValues, NameStr, InsertBefore); } /// Provide fast operand accessors @@ -2749,6 +2775,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value) class LandingPadInst : public Instruction { using CleanupField = BoolBitfieldElementT<0>; + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. unsigned ReservedSpace; @@ -2763,7 +2791,7 @@ private: const Twine &NameStr, InsertPosition InsertBefore); // Allocate space for exactly zero operands. - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void growOperands(unsigned Size); void init(unsigned NumReservedValues, const Twine &NameStr); @@ -2843,7 +2871,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value) /// does not continue in this function any longer. /// class ReturnInst : public Instruction { - ReturnInst(const ReturnInst &RI); + ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo); private: // ReturnInst constructors: @@ -2859,8 +2887,8 @@ private: // // NOTE: If the Value* passed is of type void then the constructor behaves as // if it was passed NULL. - explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr, - InsertPosition InsertBefore = nullptr); + explicit ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo, + InsertPosition InsertBefore); protected: // Note: Instruction needs to be a friend here to call cloneImpl. 
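The pattern repeated throughout these Instructions.h hunks is the point of the patch: each fixed-operand instruction now declares one constexpr IntrusiveOperandsAllocMarker carrying its operand count, and that single marker feeds both the class-specific operator new and, converted to AllocInfo, the Instruction/User base constructor, so the allocation size and the recorded operand count can no longer drift apart the way separate "new (3)" and NumOps arguments could. The standalone toy below sketches that scheme; MiniUser, MiniSelect, and this simplified AllocInfo are invented stand-ins for illustration only, and the real User places its co-allocated Use array before the object and tracks far more state, so this is a model of the idea rather than LLVM's implementation.

// Toy model of the AllocMarker scheme. All names here (MiniUser, MiniSelect,
// and this AllocInfo) are invented; see llvm/include/llvm/IR/User.h for the
// real definitions.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

struct IntrusiveOperandsAllocMarker {
  const unsigned NumOps; // operand count fixed at compile time
};

struct AllocInfo {
  const unsigned NumOps;
  // The implicit conversion is what lets one marker serve both operator new
  // and the base-class constructor.
  constexpr AllocInfo(IntrusiveOperandsAllocMarker M) : NumOps(M.NumOps) {}
};

// alignas keeps the trailing operand slots pointer-aligned; the real User
// instead puts its Use array *before* the object.
class alignas(void *) MiniUser {
  unsigned NumOperands;

protected:
  // As in the patch: the default operator new is deleted so every subclass
  // must pick an allocation mode explicitly.
  void *operator new(std::size_t) = delete;

  // Co-allocate the object plus NumOps trailing operand slots in one block.
  void *operator new(std::size_t Size, IntrusiveOperandsAllocMarker M) {
    return std::malloc(Size + M.NumOps * sizeof(void *));
  }

  explicit MiniUser(AllocInfo AI) : NumOperands(AI.NumOps) {}

public:
  void operator delete(void *Ptr) { std::free(Ptr); }
  unsigned getNumOperands() const { return NumOperands; }
  // Valid only because the subclass below adds no data members of its own.
  void **op_begin() { return reinterpret_cast<void **>(this + 1); }
};

// A three-operand node in the style of SelectInst after this patch.
class MiniSelect : public MiniUser {
  constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};

  MiniSelect(void *C, void *T, void *F) : MiniUser(AllocMarker) {
    op_begin()[0] = C;
    op_begin()[1] = T;
    op_begin()[2] = F;
  }

public:
  static MiniSelect *Create(void *C, void *T, void *F) {
    // One typed marker replaces the old bare 'new (3)'.
    return new (AllocMarker) MiniSelect(C, T, F);
  }
};

int main() {
  int A = 1, B = 2, C = 3;
  MiniSelect *S = MiniSelect::Create(&A, &B, &C);
  std::printf("co-allocated operands: %u\n", S->getNumOperands());
  delete S;
  return 0;
}

Calls that carry operand bundles (CallInst, InvokeInst, CallBrInst in the hunks above) use the third marker, IntrusiveOperandsAndDescriptorAllocMarker, whose extra DescBytes field sizes the bundle descriptor that gets co-allocated the same way.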
@@ -2871,11 +2899,13 @@ protected: public: static ReturnInst *Create(LLVMContext &C, Value *retVal = nullptr, InsertPosition InsertBefore = nullptr) { - return new(!!retVal) ReturnInst(C, retVal, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{retVal ? 1U : 0U}; + return new (AllocMarker) ReturnInst(C, retVal, AllocMarker, InsertBefore); } static ReturnInst *Create(LLVMContext &C, BasicBlock *InsertAtEnd) { - return new (0) ReturnInst(C, nullptr, InsertAtEnd); + IntrusiveOperandsAllocMarker AllocMarker{0}; + return new (AllocMarker) ReturnInst(C, nullptr, AllocMarker, InsertAtEnd); } /// Provide fast operand accessors @@ -2923,7 +2953,7 @@ class BranchInst : public Instruction { /// [Cond, FalseDest,] TrueDest. This makes some accessors faster because /// they don't have to check for cond/uncond branchness. These are mostly /// accessed relative from op_end(). - BranchInst(const BranchInst &BI); + BranchInst(const BranchInst &BI, AllocInfo AllocInfo); // BranchInst constructors (where {B, T, F} are blocks, and C is a condition): // BranchInst(BB *B) - 'br B' // BranchInst(BB* T, BB *F, Value *C) - 'br C, T, F' @@ -2933,10 +2963,10 @@ class BranchInst : public Instruction { // BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I // BranchInst(BB* B, BB *I) - 'br B' insert at end // BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end - explicit BranchInst(BasicBlock *IfTrue, - InsertPosition InsertBefore = nullptr); + explicit BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo, + InsertPosition InsertBefore); BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, - InsertPosition InsertBefore = nullptr); + AllocInfo AllocInfo, InsertPosition InsertBefore); void AssertOK(); @@ -2976,13 +3006,16 @@ public: static BranchInst *Create(BasicBlock *IfTrue, InsertPosition InsertBefore = nullptr) { - return new(1) BranchInst(IfTrue, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{1}; + return new (AllocMarker) BranchInst(IfTrue, AllocMarker, InsertBefore); } static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, InsertPosition InsertBefore = nullptr) { - return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{3}; + return new (AllocMarker) + BranchInst(IfTrue, IfFalse, Cond, AllocMarker, InsertBefore); } /// Transparently provide more efficient getOperand methods. @@ -3054,6 +3087,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) /// Multiway switch /// class SwitchInst : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + unsigned ReservedSpace; // Operand[0] = Value to switch on @@ -3070,7 +3105,7 @@ class SwitchInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *Value, BasicBlock *Default, unsigned NumReserved); void growOperands(); @@ -3442,6 +3477,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value) /// Indirect Branch Instruction. 
/// class IndirectBrInst : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + unsigned ReservedSpace; // Operand[0] = Address to jump to @@ -3456,7 +3493,7 @@ class IndirectBrInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *Address, unsigned NumDests); void growOperands(); @@ -3576,14 +3613,14 @@ class InvokeInst : public CallBase { /// The index from the end of the operand array to the unwind destination. static constexpr int UnwindDestOpEndIdx = -2; - InvokeInst(const InvokeInst &BI); + InvokeInst(const InvokeInst &BI, AllocInfo AllocInfo); /// Construct an InvokeInst given a range of arguments. /// /// Construct an InvokeInst from a range of arguments inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef<Value *> Args, - ArrayRef<OperandBundleDef> Bundles, int NumOperands, + ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, @@ -3591,10 +3628,11 @@ class InvokeInst : public CallBase { ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(unsigned NumArgs, + size_t NumBundleInputs = 0) { // We need one operand for the called function, plus our extra operands and // the input operand counts provided. - return 1 + NumExtraOperands + NumArgs + NumBundleInputs; + return 1 + NumExtraOperands + NumArgs + unsigned(NumBundleInputs); } protected: @@ -3608,10 +3646,11 @@ public: BasicBlock *IfException, ArrayRef<Value *> Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size()); - return new (NumOperands) + IntrusiveOperandsAllocMarker AllocMarker{ + ComputeNumOperands(unsigned(Args.size()))}; + return new (AllocMarker) InvokeInst(Ty, Func, IfNormal, IfException, Args, std::nullopt, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, @@ -3619,12 +3658,12 @@ public: ArrayRef<OperandBundleDef> Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - int NumOperands = - ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); - unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - return new (NumOperands, DescriptorBytes) - InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands, + return new (AllocMarker) + InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, AllocMarker, NameStr, InsertBefore); } @@ -3709,10 +3748,9 @@ private: InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef<Value *> Args, - ArrayRef<OperandBundleDef> Bundles, int NumOperands, + ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Invoke, - OperandTraits<CallBase>::op_end(this) - NumOperands, 
NumOperands, + : CallBase(Ty->getReturnType(), Instruction::Invoke, AllocInfo, InsertBefore) { init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr); } @@ -3729,7 +3767,7 @@ class CallBrInst : public CallBase { unsigned NumIndirectDests; - CallBrInst(const CallBrInst &BI); + CallBrInst(const CallBrInst &BI, AllocInfo AllocInfo); /// Construct a CallBrInst given a range of arguments. /// @@ -3737,7 +3775,7 @@ class CallBrInst : public CallBase { inline CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles, - int NumOperands, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(FunctionType *FTy, Value *Func, BasicBlock *DefaultDest, @@ -3745,11 +3783,11 @@ class CallBrInst : public CallBase { ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumIndirectDests, - int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(int NumArgs, int NumIndirectDests, + int NumBundleInputs = 0) { // We need one operand for the called function, plus our extra operands and // the input operand counts provided. - return 2 + NumIndirectDests + NumArgs + NumBundleInputs; + return unsigned(2 + NumIndirectDests + NumArgs + NumBundleInputs); } protected: @@ -3764,10 +3802,11 @@ public: ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size()); - return new (NumOperands) + IntrusiveOperandsAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), IndirectDests.size())}; + return new (AllocMarker) CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, std::nullopt, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static CallBrInst * @@ -3775,13 +3814,14 @@ public: ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size(), - CountBundleInputs(Bundles)); - unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), IndirectDests.size(), + CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - return new (NumOperands, DescriptorBytes) + return new (AllocMarker) CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, @@ -3881,10 +3921,9 @@ private: CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args, - ArrayRef<OperandBundleDef> Bundles, int NumOperands, + ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::CallBr, - OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands, + : CallBase(Ty->getReturnType(), Instruction::CallBr, AllocInfo, InsertBefore) { init(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, NameStr); } @@ -3897,6 +3936,8 @@ CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, 
BasicBlock *DefaultDest, /// Resume the propagation of an exception. /// class ResumeInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + ResumeInst(const ResumeInst &RI); explicit ResumeInst(Value *Exn, InsertPosition InsertBefore = nullptr); @@ -3909,7 +3950,7 @@ protected: public: static ResumeInst *Create(Value *Exn, InsertPosition InsertBefore = nullptr) { - return new(1) ResumeInst(Exn, InsertBefore); + return new (AllocMarker) ResumeInst(Exn, InsertBefore); } /// Provide fast operand accessors @@ -3951,6 +3992,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value) class CatchSwitchInst : public Instruction { using UnwindDestField = BoolBitfieldElementT<0>; + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. unsigned ReservedSpace; @@ -3969,7 +4012,7 @@ class CatchSwitchInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved); void growOperands(unsigned Size); @@ -4114,9 +4157,9 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value) class CleanupPadInst : public FuncletPadInst { private: explicit CleanupPadInst(Value *ParentPad, ArrayRef<Value *> Args, - unsigned Values, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, Values, + : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, AllocInfo, NameStr, InsertBefore) {} public: @@ -4124,9 +4167,9 @@ public: ArrayRef<Value *> Args = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - unsigned Values = 1 + Args.size(); - return new (Values) - CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())}; + return new (AllocMarker) + CleanupPadInst(ParentPad, Args, AllocMarker, NameStr, InsertBefore); } /// Methods for support type inquiry through isa, cast, and dyn_cast: @@ -4144,18 +4187,18 @@ public: class CatchPadInst : public FuncletPadInst { private: explicit CatchPadInst(Value *CatchSwitch, ArrayRef<Value *> Args, - unsigned Values, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, Values, + : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, AllocInfo, NameStr, InsertBefore) {} public: static CatchPadInst *Create(Value *CatchSwitch, ArrayRef<Value *> Args, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - unsigned Values = 1 + Args.size(); - return new (Values) - CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())}; + return new (AllocMarker) + CatchPadInst(CatchSwitch, Args, AllocMarker, NameStr, InsertBefore); } /// Convenience accessors @@ -4181,6 +4224,8 @@ public: //===----------------------------------------------------------------------===// class CatchReturnInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + CatchReturnInst(const CatchReturnInst &RI); CatchReturnInst(Value *CatchPad, BasicBlock *BB, InsertPosition InsertBefore); @@ -4197,7 
+4242,7 @@ public: InsertPosition InsertBefore = nullptr) { assert(CatchPad); assert(BB); - return new (2) CatchReturnInst(CatchPad, BB, InsertBefore); + return new (AllocMarker) CatchReturnInst(CatchPad, BB, InsertBefore); } /// Provide fast operand accessors @@ -4257,9 +4302,9 @@ class CleanupReturnInst : public Instruction { using UnwindDestField = BoolBitfieldElementT<0>; private: - CleanupReturnInst(const CleanupReturnInst &RI); - CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values, - InsertPosition InsertBefore = nullptr); + CleanupReturnInst(const CleanupReturnInst &RI, AllocInfo AllocInfo); + CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr); void init(Value *CleanupPad, BasicBlock *UnwindBB); @@ -4277,8 +4322,9 @@ public: unsigned Values = 1; if (UnwindBB) ++Values; - return new (Values) - CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{Values}; + return new (AllocMarker) + CleanupReturnInst(CleanupPad, UnwindBB, AllocMarker, InsertBefore); } /// Provide fast operand accessors @@ -4350,6 +4396,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value) /// end of the block cannot be reached. /// class UnreachableInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -4361,7 +4409,7 @@ public: InsertPosition InsertBefore = nullptr); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } unsigned getNumSuccessors() const { return 0; } diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index 910815f..39e1314 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -43,39 +43,86 @@ struct OperandTraits; class User : public Value { friend struct HungoffOperandTraits; + template <class ConstantClass> friend struct ConstantAggrKeyType; LLVM_ATTRIBUTE_ALWAYS_INLINE static void * allocateFixedOperandUser(size_t, unsigned, unsigned); protected: + // Disable the default operator new, as all subclasses must use one of the + // custom operators below depending on how they store their operands. + void *operator new(size_t Size) = delete; + + /// Indicates this User has operands "hung off" in another allocation. + struct HungOffOperandsAllocMarker {}; + + /// Indicates this User has operands co-allocated. + struct IntrusiveOperandsAllocMarker { + /// The number of operands for this User. + const unsigned NumOps; + }; + + /// Indicates this User has operands and a descriptor co-allocated. + struct IntrusiveOperandsAndDescriptorAllocMarker { + /// The number of operands for this User. + const unsigned NumOps; + /// The number of bytes to allocate for the descriptor. Must be divisible by + /// `sizeof(void *)`. + const unsigned DescBytes; + }; + + /// Information about how a User object was allocated, to be passed into the + /// User constructor. + /// + /// DO NOT USE DIRECTLY. Use one of the `AllocMarker` structs instead, they + /// can all be implicitly converted to `AllocInfo`.
+ struct AllocInfo { + public: + const unsigned NumOps : NumUserOperandsBits; + const bool HasHungOffUses : 1; + const bool HasDescriptor : 1; + + AllocInfo() = delete; + + constexpr AllocInfo(const HungOffOperandsAllocMarker) + : NumOps(0), HasHungOffUses(true), HasDescriptor(false) {} + + constexpr AllocInfo(const IntrusiveOperandsAllocMarker Alloc) + : NumOps(Alloc.NumOps), HasHungOffUses(false), HasDescriptor(false) {} + + constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc) + : NumOps(Alloc.NumOps), HasHungOffUses(false), + HasDescriptor(Alloc.DescBytes != 0) {} + }; + /// Allocate a User with an operand pointer co-allocated. /// /// This is used for subclasses which need to allocate a variable number /// of operands, ie, 'hung off uses'. - void *operator new(size_t Size); + void *operator new(size_t Size, HungOffOperandsAllocMarker); /// Allocate a User with the operands co-allocated. /// /// This is used for subclasses which have a fixed number of operands. - void *operator new(size_t Size, unsigned Us); + void *operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait); /// Allocate a User with the operands co-allocated. If DescBytes is non-zero /// then allocate an additional DescBytes bytes before the operands. These /// bytes can be accessed by calling getDescriptor. - /// - /// DescBytes needs to be divisible by sizeof(void *). The allocated - /// descriptor, if any, is aligned to sizeof(void *) bytes. - /// - /// This is used for subclasses which have a fixed number of operands. - void *operator new(size_t Size, unsigned Us, unsigned DescBytes); - - User(Type *ty, unsigned vty, Use *, unsigned NumOps) - : Value(ty, vty) { - assert(NumOps < (1u << NumUserOperandsBits) && "Too many operands"); - NumUserOperands = NumOps; + void *operator new(size_t Size, + IntrusiveOperandsAndDescriptorAllocMarker allocTrait); + + User(Type *ty, unsigned vty, AllocInfo AllocInfo) : Value(ty, vty) { + assert(AllocInfo.NumOps < (1u << NumUserOperandsBits) && + "Too many operands"); + NumUserOperands = AllocInfo.NumOps; + assert((!AllocInfo.HasDescriptor || !AllocInfo.HasHungOffUses) && + "Cannot have both hung off uses and a descriptor"); + HasHungOffUses = AllocInfo.HasHungOffUses; + HasDescriptor = AllocInfo.HasDescriptor; // If we have hung off uses, then the operand list should initially be // null. - assert((!HasHungOffUses || !getOperandList()) && + assert((!AllocInfo.HasHungOffUses || !getOperandList()) && "Error in initializing hung off uses for User"); } @@ -98,7 +145,20 @@ public: /// Free memory allocated for User and Use objects. void operator delete(void *Usr); /// Placement delete - required by std, called if the ctor throws. - void operator delete(void *Usr, unsigned) { + void operator delete(void *Usr, HungOffOperandsAllocMarker) { + // Note: If a subclass manipulates the information which is required to + // calculate the Usr memory pointer, e.g. NumUserOperands, the operator + // delete of that subclass has to restore the changed information to the + // original value, since the dtor of that class is not called if the ctor + // fails. + User::operator delete(Usr); + +#ifndef LLVM_ENABLE_EXCEPTIONS + llvm_unreachable("Constructor throws?"); +#endif + } + /// Placement delete - required by std, called if the ctor throws. + void operator delete(void *Usr, IntrusiveOperandsAllocMarker) { // Note: If a subclass manipulates the information which is required to calculate the // Usr memory pointer, e.g. 
NumUserOperands, the operator delete of that subclass has // to restore the changed information to the original value, since the dtor of that class @@ -110,7 +170,7 @@ public: #endif } /// Placement delete - required by std, called if the ctor throws. - void operator delete(void *Usr, unsigned, unsigned) { + void operator delete(void *Usr, IntrusiveOperandsAndDescriptorAllocMarker) { // Note: If a subclass manipulates the information which is required to calculate the // Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has // to restore the changed information to the original value, since the dtor of that class @@ -195,19 +255,6 @@ public: /// Returns the descriptor co-allocated with this User instance. MutableArrayRef<uint8_t> getDescriptor(); - /// Set the number of operands on a GlobalVariable. - /// - /// GlobalVariable always allocates space for a single operands, but - /// doesn't always use it. - /// - /// FIXME: As that the number of operands is used to find the start of - /// the allocated memory in operator delete, we need to always think we have - /// 1 operand before delete. - void setGlobalVariableNumOperands(unsigned NumOps) { - assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands"); - NumUserOperands = NumOps; - } - /// Subclasses with hung off uses need to manage the operand count /// themselves. In these instances, the operand count isn't used to find the /// OperandList, so there's no issue in having the operand count change. diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 2fdbbbd..8888468 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1506,6 +1506,10 @@ public: static Value *create(Value *Cond, Value *True, Value *False, BasicBlock *InsertAtEnd, Context &Ctx, const Twine &Name = ""); + + const Value *getCondition() const { return getOperand(0); } + const Value *getTrueValue() const { return getOperand(1); } + const Value *getFalseValue() const { return getOperand(2); } Value *getCondition() { return getOperand(0); } Value *getTrueValue() { return getOperand(1); } Value *getFalseValue() { return getOperand(2); } @@ -1513,7 +1517,16 @@ public: void setCondition(Value *New) { setOperand(0, New); } void setTrueValue(Value *New) { setOperand(1, New); } void setFalseValue(Value *New) { setOperand(2, New); } - void swapValues() { cast<llvm::SelectInst>(Val)->swapValues(); } + void swapValues(); + + /// Return a string if the specified operands are invalid for a select + /// operation, otherwise return null. + static const char *areInvalidOperands(Value *Cond, Value *True, + Value *False) { + return llvm::SelectInst::areInvalidOperands(Cond->Val, True->Val, + False->Val); + } + /// For isa/dyn_cast. static bool classof(const Value *From); }; diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 14a32cc..66fdcb4 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -755,7 +755,7 @@ template <class X, class Y> auto dyn_cast_if_present(Y *Val) { // Forwards to dyn_cast_if_present to avoid breaking current users. This is // deprecated and will be removed in a future patch, use -// cast_if_present instead. +// dyn_cast_if_present instead. 
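The Casting.h hunk above is a comment-only fix: dyn_cast_or_null already forwards to dyn_cast_if_present, and the deprecation note now names that function instead of cast_if_present. A minimal compile-only sketch of the preferred spelling, assuming LLVM headers are on the include path; asInstruction is an invented helper, not LLVM API:

#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

// Unlike plain dyn_cast, dyn_cast_if_present tolerates a null input and
// returns null instead of asserting.
llvm::Instruction *asInstruction(llvm::Value *V) {
  return llvm::dyn_cast_if_present<llvm::Instruction>(V);
}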
template <class X, class Y> auto dyn_cast_or_null(const Y &Val) { return dyn_cast_if_present<X>(Val); } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index e32a54f..6d035d5 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1267,9 +1267,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C, } ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT, - ArrayRef<Constant *> V) - : Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(), - V.size()) { + ArrayRef<Constant *> V, + AllocInfo AllocInfo) + : Constant(T, VT, AllocInfo) { llvm::copy(V, op_begin()); // Check that types match, unless this is an opaque struct. @@ -1282,8 +1282,9 @@ ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT, } } -ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V) - : ConstantAggregate(T, ConstantArrayVal, V) { +ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantArrayVal, V, AllocInfo) { assert(V.size() == T->getNumElements() && "Invalid initializer for constant array"); } @@ -1346,8 +1347,9 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V, return getTypeForElements(V[0]->getContext(), V, Packed); } -ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V) - : ConstantAggregate(T, ConstantStructVal, V) { +ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantStructVal, V, AllocInfo) { assert((T->isOpaque() || V.size() == T->getNumElements()) && "Invalid initializer for constant struct"); } @@ -1388,8 +1390,9 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) { return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } -ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V) - : ConstantAggregate(T, ConstantVectorVal, V) { +ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantVectorVal, V, AllocInfo) { assert(V.size() == cast<FixedVectorType>(T)->getNumElements() && "Invalid initializer for constant vector"); } @@ -1879,7 +1882,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { BlockAddress::BlockAddress(Function *F, BasicBlock *BB) : Constant(PointerType::get(F->getContext(), F->getAddressSpace()), - Value::BlockAddressVal, &Op<0>(), 2) { + Value::BlockAddressVal, AllocMarker) { setOperand(0, F); setOperand(1, BB); BB->AdjustBlockAddressRefCount(1); @@ -1951,7 +1954,7 @@ DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) { } DSOLocalEquivalent::DSOLocalEquivalent(GlobalValue *GV) - : Constant(GV->getType(), Value::DSOLocalEquivalentVal, &Op<0>(), 1) { + : Constant(GV->getType(), Value::DSOLocalEquivalentVal, AllocMarker) { setOperand(0, GV); } @@ -2009,7 +2012,7 @@ NoCFIValue *NoCFIValue::get(GlobalValue *GV) { } NoCFIValue::NoCFIValue(GlobalValue *GV) - : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) { + : Constant(GV->getType(), Value::NoCFIValueVal, AllocMarker) { setOperand(0, GV); } @@ -2056,7 +2059,7 @@ ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const { ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, Constant *AddrDisc) - : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) { + : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, AllocMarker) { assert(Ptr->getType()->isPointerTy()); 
assert(Key->getBitWidth() == 32); assert(Disc->getBitWidth() == 64); @@ -2758,11 +2761,8 @@ const char *ConstantExpr::getOpcodeName() const { GetElementPtrConstantExpr::GetElementPtrConstantExpr( Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy, - std::optional<ConstantRange> InRange) - : ConstantExpr(DestTy, Instruction::GetElementPtr, - OperandTraits<GetElementPtrConstantExpr>::op_end(this) - - (IdxList.size() + 1), - IdxList.size() + 1), + std::optional<ConstantRange> InRange, AllocInfo AllocInfo) + : ConstantExpr(DestTy, Instruction::GetElementPtr, AllocInfo), SrcElementTy(SrcElementTy), ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)), InRange(std::move(InRange)) { diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index bd19ec6..6afc86f 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -44,14 +44,16 @@ namespace llvm { /// CastConstantExpr - This class is private to Constants.cpp, and is used /// behind the scenes to implement cast constant exprs. class CastConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + public: CastConstantExpr(unsigned Opcode, Constant *C, Type *Ty) - : ConstantExpr(Ty, Opcode, &Op<0>(), 1) { + : ConstantExpr(Ty, Opcode, AllocMarker) { Op<0>() = C; } // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -67,17 +69,19 @@ public: /// BinaryConstantExpr - This class is private to Constants.cpp, and is used /// behind the scenes to implement binary constant exprs. class BinaryConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2, unsigned Flags) - : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) { + : ConstantExpr(C1->getType(), Opcode, AllocMarker) { Op<0>() = C1; Op<1>() = C2; SubclassOptionalData = Flags; } // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -95,16 +99,18 @@ public: /// Constants.cpp, and is used behind the scenes to implement /// extractelement constant exprs. class ExtractElementConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: ExtractElementConstantExpr(Constant *C1, Constant *C2) - : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(), - Instruction::ExtractElement, &Op<0>(), 2) { + : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(), + Instruction::ExtractElement, AllocMarker) { Op<0>() = C1; Op<1>() = C2; } // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -122,17 +128,18 @@ public: /// Constants.cpp, and is used behind the scenes to implement /// insertelement constant exprs. 
class InsertElementConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + public: InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3) - : ConstantExpr(C1->getType(), Instruction::InsertElement, - &Op<0>(), 3) { + : ConstantExpr(C1->getType(), Instruction::InsertElement, AllocMarker) { Op<0>() = C1; Op<1>() = C2; Op<2>() = C3; } // allocate space for exactly three operands - void *operator new(size_t S) { return User::operator new(S, 3); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -150,12 +157,14 @@ public: /// Constants.cpp, and is used behind the scenes to implement /// shufflevector constant exprs. class ShuffleVectorConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: ShuffleVectorConstantExpr(Constant *C1, Constant *C2, ArrayRef<int> Mask) : ConstantExpr(VectorType::get( cast<VectorType>(C1->getType())->getElementType(), Mask.size(), isa<ScalableVectorType>(C1->getType())), - Instruction::ShuffleVector, &Op<0>(), 2) { + Instruction::ShuffleVector, AllocMarker) { assert(ShuffleVectorInst::isValidOperands(C1, C2, Mask) && "Invalid shuffle vector instruction operands!"); Op<0>() = C1; @@ -168,7 +177,7 @@ public: SmallVector<int, 4> ShuffleMask; Constant *ShuffleMaskForBitcode; - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -191,15 +200,17 @@ class GetElementPtrConstantExpr : public ConstantExpr { GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy, - std::optional<ConstantRange> InRange); + std::optional<ConstantRange> InRange, + AllocInfo AllocInfo); public: static GetElementPtrConstantExpr * Create(Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy, unsigned Flags, std::optional<ConstantRange> InRange) { - GetElementPtrConstantExpr *Result = new (IdxList.size() + 1) + IntrusiveOperandsAllocMarker AllocMarker{unsigned(IdxList.size() + 1)}; + GetElementPtrConstantExpr *Result = new (AllocMarker) GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy, - std::move(InRange)); + std::move(InRange), AllocMarker); Result->SubclassOptionalData = Flags; return Result; } @@ -318,7 +329,8 @@ template <class ConstantClass> struct ConstantAggrKeyType { using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass; ConstantClass *create(TypeClass *Ty) const { - return new (Operands.size()) ConstantClass(Ty, Operands); + User::IntrusiveOperandsAllocMarker AllocMarker{unsigned(Operands.size())}; + return new (AllocMarker) ConstantClass(Ty, Operands, AllocMarker); } }; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index afef893..82ff4e1 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -402,7 +402,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N, Module *M) { - auto *F = new Function(Ty, Linkage, AddrSpace, N, M); + auto *F = new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M); AttrBuilder B(F->getContext()); UWTableKind UWTable = M->getUwtable(); if (UWTable != UWTableKind::None) @@ 
-501,8 +501,7 @@ static unsigned computeAddrSpace(unsigned AddrSpace, Module *M) { Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &name, Module *ParentModule) - : GlobalObject(Ty, Value::FunctionVal, - OperandTraits<Function>::op_begin(this), 0, Linkage, name, + : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name, computeAddrSpace(AddrSpace, ParentModule)), NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) { assert(FunctionType::isValidReturnType(getReturnType()) && diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 2bc69cd..99f4fa5 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -442,9 +442,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(Ty, Value::GlobalVariableVal, - OperandTraits<GlobalVariable>::op_begin(this), - InitVal != nullptr, Link, Name, AddressSpace), + : GlobalObject(Ty, Value::GlobalVariableVal, AllocMarker, Link, Name, + AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) && @@ -454,6 +453,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); Op<0>() = InitVal; + } else { + setGlobalVariableNumOperands(0); } } @@ -540,7 +541,7 @@ void GlobalVariable::setCodeModel(CodeModel::Model CM) { GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, const Twine &Name, Constant *Aliasee, Module *ParentModule) - : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name, + : GlobalValue(Ty, Value::GlobalAliasVal, AllocMarker, Link, Name, AddressSpace) { setAliasee(Aliasee); if (ParentModule) @@ -597,7 +598,7 @@ const GlobalObject *GlobalAlias::getAliaseeObject() const { GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link, const Twine &Name, Constant *Resolver, Module *ParentModule) - : GlobalObject(Ty, Value::GlobalIFuncVal, &Op<0>(), 1, Link, Name, + : GlobalObject(Ty, Value::GlobalIFuncVal, AllocMarker, Link, Name, AddressSpace) { setResolver(Resolver); if (ParentModule) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 62d88ce..b1c2b02 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -32,9 +32,9 @@ InsertPosition::InsertPosition(Instruction *InsertBefore) InsertPosition::InsertPosition(BasicBlock *InsertAtEnd) : InsertAt(InsertAtEnd ? InsertAtEnd->end() : InstListType::iterator()) {} -Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, +Instruction::Instruction(Type *ty, unsigned it, AllocInfo AllocInfo, InsertPosition InsertBefore) - : User(ty, Value::InstructionVal + it, Ops, NumOps) { + : User(ty, Value::InstructionVal + it, AllocInfo) { // When called with an iterator, there must be a block to insert into. 
if (InstListType::iterator InsertIt = InsertBefore; InsertIt.isValid()) { BasicBlock *BB = InsertIt.getNodeParent(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 19da1f6..e95b98a 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -121,8 +121,9 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { //===----------------------------------------------------------------------===// PHINode::PHINode(const PHINode &PN) - : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()), + : Instruction(PN.getType(), Instruction::PHI, AllocMarker), ReservedSpace(PN.getNumOperands()) { + NumUserOperands = PN.getNumOperands(); allocHungoffUses(PN.getNumOperands()); std::copy(PN.op_begin(), PN.op_end(), op_begin()); copyIncomingBlocks(make_range(PN.block_begin(), PN.block_end())); @@ -243,14 +244,14 @@ bool PHINode::hasConstantOrUndefValue() const { LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) { + : Instruction(RetTy, Instruction::LandingPad, AllocMarker, InsertBefore) { init(NumReservedValues, NameStr); } LandingPadInst::LandingPadInst(const LandingPadInst &LP) - : Instruction(LP.getType(), Instruction::LandingPad, nullptr, - LP.getNumOperands()), + : Instruction(LP.getType(), Instruction::LandingPad, AllocMarker), ReservedSpace(LP.getNumOperands()) { + NumUserOperands = LP.getNumOperands(); allocHungoffUses(LP.getNumOperands()); Use *OL = getOperandList(); const Use *InOL = LP.getOperandList(); @@ -716,16 +717,16 @@ void CallInst::init(FunctionType *FTy, Value *Func, const Twine &NameStr) { } CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name, - InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Call, - OperandTraits<CallBase>::op_end(this) - 1, 1, InsertBefore) { + AllocInfo AllocInfo, InsertPosition InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo, + InsertBefore) { init(Ty, Func, Name); } -CallInst::CallInst(const CallInst &CI) - : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, - OperandTraits<CallBase>::op_end(this) - CI.getNumOperands(), - CI.getNumOperands()) { +CallInst::CallInst(const CallInst &CI, AllocInfo AllocInfo) + : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, AllocInfo) { + assert(getNumOperands() == CI.getNumOperands() && + "Wrong number of operands allocated"); setTailCallKind(CI.getTailCallKind()); setCallingConv(CI.getCallingConv()); @@ -774,7 +775,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, const Twine &NameStr) { this->FTy = FTy; - assert((int)getNumOperands() == + assert(getNumOperands() == ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) && "NumOperands not set up?"); @@ -803,10 +804,10 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, setName(NameStr); } -InvokeInst::InvokeInst(const InvokeInst &II) - : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, - OperandTraits<CallBase>::op_end(this) - II.getNumOperands(), - II.getNumOperands()) { +InvokeInst::InvokeInst(const InvokeInst &II, AllocInfo AllocInfo) + : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, AllocInfo) { + assert(getNumOperands() == II.getNumOperands() && + "Wrong number of operands allocated"); setCallingConv(II.getCallingConv()); std::copy(II.op_begin(), 
II.op_end(), op_begin()); std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(), @@ -855,9 +856,9 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough, const Twine &NameStr) { this->FTy = FTy; - assert((int)getNumOperands() == - ComputeNumOperands(Args.size(), IndirectDests.size(), - CountBundleInputs(Bundles)) && + assert(getNumOperands() == ComputeNumOperands(Args.size(), + IndirectDests.size(), + CountBundleInputs(Bundles)) && "NumOperands not set up?"); #ifndef NDEBUG @@ -887,10 +888,11 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough, setName(NameStr); } -CallBrInst::CallBrInst(const CallBrInst &CBI) +CallBrInst::CallBrInst(const CallBrInst &CBI, AllocInfo AllocInfo) : CallBase(CBI.Attrs, CBI.FTy, CBI.getType(), Instruction::CallBr, - OperandTraits<CallBase>::op_end(this) - CBI.getNumOperands(), - CBI.getNumOperands()) { + AllocInfo) { + assert(getNumOperands() == CBI.getNumOperands() && + "Wrong number of operands allocated"); setCallingConv(CBI.getCallingConv()); std::copy(CBI.op_begin(), CBI.op_end(), op_begin()); std::copy(CBI.bundle_op_info_begin(), CBI.bundle_op_info_end(), @@ -918,19 +920,19 @@ CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB, // ReturnInst Implementation //===----------------------------------------------------------------------===// -ReturnInst::ReturnInst(const ReturnInst &RI) +ReturnInst::ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo) : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret, - OperandTraits<ReturnInst>::op_end(this) - RI.getNumOperands(), - RI.getNumOperands()) { + AllocInfo) { + assert(getNumOperands() == RI.getNumOperands() && + "Wrong number of operands allocated"); if (RI.getNumOperands()) Op<0>() = RI.Op<0>(); SubclassOptionalData = RI.SubclassOptionalData; } -ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, +ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo, InsertPosition InsertBefore) - : Instruction(Type::getVoidTy(C), Instruction::Ret, - OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal, + : Instruction(Type::getVoidTy(C), Instruction::Ret, AllocInfo, InsertBefore) { if (retVal) Op<0>() = retVal; @@ -942,13 +944,13 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, ResumeInst::ResumeInst(const ResumeInst &RI) : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume, - OperandTraits<ResumeInst>::op_begin(this), 1) { + AllocMarker) { Op<0>() = RI.Op<0>(); } ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume, - OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) { + AllocMarker, InsertBefore) { Op<0>() = Exn; } @@ -956,11 +958,11 @@ ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore) // CleanupReturnInst Implementation //===----------------------------------------------------------------------===// -CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI) - : Instruction(CRI.getType(), Instruction::CleanupRet, - OperandTraits<CleanupReturnInst>::op_end(this) - - CRI.getNumOperands(), - CRI.getNumOperands()) { +CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI, + AllocInfo AllocInfo) + : Instruction(CRI.getType(), Instruction::CleanupRet, AllocInfo) { + assert(getNumOperands() == CRI.getNumOperands() && + "Wrong number of operands allocated"); setSubclassData<Instruction::OpaqueField>( CRI.getSubclassData<Instruction::OpaqueField>()); Op<0>() = 
  CRI.Op<0>();
@@ -978,12 +980,10 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
 }
 
 CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
-                                     unsigned Values,
+                                     AllocInfo AllocInfo,
                                      InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(CleanupPad->getContext()),
-                  Instruction::CleanupRet,
-                  OperandTraits<CleanupReturnInst>::op_end(this) - Values,
-                  Values, InsertBefore) {
+                  Instruction::CleanupRet, AllocInfo, InsertBefore) {
   init(CleanupPad, UnwindBB);
 }
 
@@ -997,7 +997,7 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) {
 
 CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
     : Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
-                  OperandTraits<CatchReturnInst>::op_begin(this), 2) {
+                  AllocMarker) {
   Op<0>() = CRI.Op<0>();
   Op<1>() = CRI.Op<1>();
 }
@@ -1005,8 +1005,7 @@ CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
 CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
                                  InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
-                  OperandTraits<CatchReturnInst>::op_begin(this), 2,
-                  InsertBefore) {
+                  AllocMarker, InsertBefore) {
   init(CatchPad, BB);
 }
 
@@ -1018,7 +1017,7 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr,
                                  InsertPosition InsertBefore)
-    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, AllocMarker,
                   InsertBefore) {
   if (UnwindDest)
     ++NumReservedValues;
@@ -1027,8 +1026,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
 }
 
 CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI)
-    : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr,
-                  CSI.getNumOperands()) {
+    : Instruction(CSI.getType(), Instruction::CatchSwitch, AllocMarker) {
+  NumUserOperands = CSI.NumUserOperands;
   init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands());
   setNumHungOffUseOperands(ReservedSpace);
   Use *OL = getOperandList();
@@ -1093,22 +1092,19 @@ void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args,
   setName(NameStr);
 }
 
-FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI)
-    : Instruction(FPI.getType(), FPI.getOpcode(),
-                  OperandTraits<FuncletPadInst>::op_end(this) -
-                      FPI.getNumOperands(),
-                  FPI.getNumOperands()) {
+FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI, AllocInfo AllocInfo)
    : Instruction(FPI.getType(), FPI.getOpcode(), AllocInfo) {
+  assert(getNumOperands() == FPI.getNumOperands() &&
+         "Wrong number of operands allocated");
   std::copy(FPI.op_begin(), FPI.op_end(), op_begin());
   setParentPad(FPI.getParentPad());
 }
 
 FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
-                               ArrayRef<Value *> Args, unsigned Values,
+                               ArrayRef<Value *> Args, AllocInfo AllocInfo,
                                const Twine &NameStr, InsertPosition InsertBefore)
-    : Instruction(ParentPad->getType(), Op,
-                  OperandTraits<FuncletPadInst>::op_end(this) - Values, Values,
-                  InsertBefore) {
+    : Instruction(ParentPad->getType(), Op, AllocInfo, InsertBefore) {
   init(ParentPad, Args, NameStr);
 }
 
@@ -1118,8 +1114,8 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
 
 UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  InsertPosition InsertBefore)
-    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
-                  0, InsertBefore) {}
+    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable,
+                  AllocMarker, InsertBefore) {}
 
 //===----------------------------------------------------------------------===//
 //                        BranchInst Implementation
 //===----------------------------------------------------------------------===//
 
@@ -1131,19 +1127,18 @@ void BranchInst::AssertOK() {
            "May only branch on boolean predicates!");
 }
 
-BranchInst::BranchInst(BasicBlock *IfTrue, InsertPosition InsertBefore)
+BranchInst::BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo,
+                       InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                  OperandTraits<BranchInst>::op_end(this) - 1, 1,
-                  InsertBefore) {
+                  AllocInfo, InsertBefore) {
   assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
-                       InsertPosition InsertBefore)
+                       AllocInfo AllocInfo, InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                  OperandTraits<BranchInst>::op_end(this) - 3, 3,
-                  InsertBefore) {
+                  AllocInfo, InsertBefore) {
   // Assign in order of operand index to make use-list order predictable.
   Op<-3>() = Cond;
   Op<-2>() = IfFalse;
@@ -1153,10 +1148,11 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
 #endif
 }
 
-BranchInst::BranchInst(const BranchInst &BI)
+BranchInst::BranchInst(const BranchInst &BI, AllocInfo AllocInfo)
     : Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br,
-                  OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
-                  BI.getNumOperands()) {
+                  AllocInfo) {
+  assert(getNumOperands() == BI.getNumOperands() &&
+         "Wrong number of operands allocated");
   // Assign in order of operand index to make use-list order predictable.
   if (BI.getNumOperands() != 1) {
     assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
@@ -1313,9 +1309,8 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
 StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
                      AtomicOrdering Order, SyncScope::ID SSID,
                      InsertPosition InsertBefore)
-    : Instruction(Type::getVoidTy(val->getContext()), Store,
-                  OperandTraits<StoreInst>::op_begin(this),
-                  OperandTraits<StoreInst>::operands(this), InsertBefore) {
+    : Instruction(Type::getVoidTy(val->getContext()), Store, AllocMarker,
+                  InsertBefore) {
   Op<0>() = val;
   Op<1>() = addr;
   setVolatile(isVolatile);
@@ -1356,8 +1351,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
                                      InsertPosition InsertBefore)
     : Instruction(
           StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
-          AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
-          OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
+          AtomicCmpXchg, AllocMarker, InsertBefore) {
   Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID);
 }
 
@@ -1389,9 +1383,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
 AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
                              Align Alignment, AtomicOrdering Ordering,
                              SyncScope::ID SSID, InsertPosition InsertBefore)
-    : Instruction(Val->getType(), AtomicRMW,
-                  OperandTraits<AtomicRMWInst>::op_begin(this),
-                  OperandTraits<AtomicRMWInst>::operands(this), InsertBefore) {
+    : Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) {
   Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
 }
 
@@ -1448,7 +1440,7 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) {
 
 FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SyncScope::ID SSID, InsertPosition InsertBefore)
-    : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
+    : Instruction(Type::getVoidTy(C), Fence, AllocMarker, InsertBefore) {
   setOrdering(Ordering);
   setSyncScopeID(SSID);
 }
@@ -1466,13 +1458,13 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
   setName(Name);
 }
 
-GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
-    : Instruction(GEPI.getType(), GetElementPtr,
-                  OperandTraits<GetElementPtrInst>::op_end(this) -
-                      GEPI.getNumOperands(),
-                  GEPI.getNumOperands()),
+GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI,
+                                     AllocInfo AllocInfo)
+    : Instruction(GEPI.getType(), GetElementPtr, AllocInfo),
       SourceElementType(GEPI.SourceElementType),
       ResultElementType(GEPI.ResultElementType) {
+  assert(getNumOperands() == GEPI.getNumOperands() &&
+         "Wrong number of operands allocated");
   std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin());
   SubclassOptionalData = GEPI.SubclassOptionalData;
 }
@@ -1606,9 +1598,8 @@ bool GetElementPtrInst::collectOffset(
 
 ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
                                        const Twine &Name,
                                        InsertPosition InsertBef)
-    : Instruction(
-          cast<VectorType>(Val->getType())->getElementType(), ExtractElement,
-          OperandTraits<ExtractElementInst>::op_begin(this), 2, InsertBef) {
+    : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+                  ExtractElement, AllocMarker, InsertBef) {
   assert(isValidOperands(Val, Index) &&
          "Invalid extractelement instruction operands!");
   Op<0>() = Val;
@@ -1629,9 +1620,7 @@ bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
 
 InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
                                      const Twine &Name,
                                      InsertPosition InsertBef)
-    : Instruction(Vec->getType(), InsertElement,
-                  OperandTraits<InsertElementInst>::op_begin(this), 3,
-                  InsertBef) {
+    : Instruction(Vec->getType(), InsertElement, AllocMarker, InsertBef) {
   assert(isValidOperands(Vec, Elt, Index) &&
         "Invalid insertelement instruction operands!");
   Op<0>() = Vec;
@@ -1679,8 +1668,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
     : Instruction(
           VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
                           cast<VectorType>(Mask->getType())->getElementCount()),
-          ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
-          OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+          ShuffleVector, AllocMarker, InsertBefore) {
   assert(isValidOperands(V1, V2, Mask) &&
          "Invalid shuffle vector instruction operands!");
 
@@ -1698,8 +1686,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef<int> Mask,
     : Instruction(
           VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
                           Mask.size(), isa<ScalableVectorType>(V1->getType())),
-          ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
-          OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+          ShuffleVector, AllocMarker, InsertBefore) {
   assert(isValidOperands(V1, V2, Mask) &&
          "Invalid shuffle vector instruction operands!");
   Op<0>() = V1;
@@ -2464,9 +2451,8 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
 }
 
 InsertValueInst::InsertValueInst(const InsertValueInst &IVI)
-    : Instruction(IVI.getType(), InsertValue,
-                  OperandTraits<InsertValueInst>::op_begin(this), 2),
-      Indices(IVI.Indices) {
+    : Instruction(IVI.getType(), InsertValue, AllocMarker),
+      Indices(IVI.Indices) {
   Op<0>() = IVI.getOperand(0);
   Op<1>() = IVI.getOperand(1);
   SubclassOptionalData = IVI.SubclassOptionalData;
@@ -2565,8 +2551,7 @@ void UnaryOperator::AssertOK() {
 
 BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
                                const Twine &Name, InsertPosition InsertBefore)
-    : Instruction(Ty, iType, OperandTraits<BinaryOperator>::op_begin(this),
-                  OperandTraits<BinaryOperator>::operands(this), InsertBefore) {
+    : Instruction(Ty, iType, AllocMarker, InsertBefore) {
   Op<0>() = S1;
   Op<1>() = S2;
   setName(Name);
@@ -3427,8 +3412,7 @@ AddrSpaceCastInst::AddrSpaceCastInst(Value *S, Type *Ty, const Twine &Name,
 CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
                  Value *RHS, const Twine &Name, InsertPosition InsertBefore,
                  Instruction *FlagsSource)
-    : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this),
-                  OperandTraits<CmpInst>::operands(this), InsertBefore) {
+    : Instruction(ty, op, AllocMarker, InsertBefore) {
   Op<0>() = LHS;
   Op<1>() = RHS;
   setPredicate((Predicate)predicate);
@@ -3918,12 +3902,12 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                  nullptr, 0, InsertBefore) {
+                  AllocMarker, InsertBefore) {
   init(Value, Default, 2+NumCases*2);
 }
 
 SwitchInst::SwitchInst(const SwitchInst &SI)
-    : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) {
+    : Instruction(SI.getType(), Instruction::Switch, AllocMarker) {
   init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
   setNumHungOffUseOperands(SI.getNumOperands());
   Use *OL = getOperandList();
@@ -4125,13 +4109,14 @@ void IndirectBrInst::growOperands() {
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                InsertPosition InsertBefore)
     : Instruction(Type::getVoidTy(Address->getContext()),
-                  Instruction::IndirectBr, nullptr, 0, InsertBefore) {
+                  Instruction::IndirectBr, AllocMarker, InsertBefore) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
     : Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
-                  nullptr, IBI.getNumOperands()) {
+                  AllocMarker) {
+  NumUserOperands = IBI.NumUserOperands;
   allocHungoffUses(IBI.getNumOperands());
   Use *OL = getOperandList();
   const Use *InOL = IBI.getOperandList();
@@ -4185,7 +4170,8 @@ FreezeInst::FreezeInst(Value *S, const Twine &Name, InsertPosition InsertBefore)
 // unit that uses these classes.
 
 GetElementPtrInst *GetElementPtrInst::cloneImpl() const {
-  return new (getNumOperands()) GetElementPtrInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) GetElementPtrInst(*this, AllocMarker);
 }
 
 UnaryOperator *UnaryOperator::cloneImpl() const {
@@ -4305,10 +4291,13 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const {
 
 CallInst *CallInst::cloneImpl() const {
   if (hasOperandBundles()) {
-    unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
-    return new(getNumOperands(), DescriptorBytes) CallInst(*this);
+    IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+        getNumOperands(),
+        getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+    return new (AllocMarker) CallInst(*this, AllocMarker);
   }
-  return new(getNumOperands()) CallInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) CallInst(*this, AllocMarker);
 }
 
 SelectInst *SelectInst::cloneImpl() const {
@@ -4331,18 +4320,20 @@ ShuffleVectorInst *ShuffleVectorInst::cloneImpl() const {
   return new ShuffleVectorInst(getOperand(0), getOperand(1), getShuffleMask());
 }
 
-PHINode *PHINode::cloneImpl() const { return new PHINode(*this); }
+PHINode *PHINode::cloneImpl() const { return new (AllocMarker) PHINode(*this); }
 
 LandingPadInst *LandingPadInst::cloneImpl() const {
   return new LandingPadInst(*this);
 }
 
 ReturnInst *ReturnInst::cloneImpl() const {
-  return new(getNumOperands()) ReturnInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) ReturnInst(*this, AllocMarker);
 }
 
 BranchInst *BranchInst::cloneImpl() const {
-  return new(getNumOperands()) BranchInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) BranchInst(*this, AllocMarker);
 }
 
 SwitchInst *SwitchInst::cloneImpl() const { return new SwitchInst(*this); }
@@ -4353,28 +4344,37 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const {
 
 InvokeInst *InvokeInst::cloneImpl() const {
   if (hasOperandBundles()) {
-    unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
-    return new(getNumOperands(), DescriptorBytes) InvokeInst(*this);
+    IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+        getNumOperands(),
+        getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+    return new (AllocMarker) InvokeInst(*this, AllocMarker);
   }
-  return new(getNumOperands()) InvokeInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) InvokeInst(*this, AllocMarker);
 }
 
 CallBrInst *CallBrInst::cloneImpl() const {
   if (hasOperandBundles()) {
-    unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
-    return new (getNumOperands(), DescriptorBytes) CallBrInst(*this);
+    IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+        getNumOperands(),
+        getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+    return new (AllocMarker) CallBrInst(*this, AllocMarker);
   }
-  return new (getNumOperands()) CallBrInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) CallBrInst(*this, AllocMarker);
 }
 
-ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); }
+ResumeInst *ResumeInst::cloneImpl() const {
+  return new (AllocMarker) ResumeInst(*this);
+}
 
 CleanupReturnInst *CleanupReturnInst::cloneImpl() const {
-  return new (getNumOperands()) CleanupReturnInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) CleanupReturnInst(*this, AllocMarker);
 }
 
 CatchReturnInst *CatchReturnInst::cloneImpl() const {
-  return new (getNumOperands()) CatchReturnInst(*this);
+  return new (AllocMarker) CatchReturnInst(*this);
 }
 
 CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
@@ -4382,7 +4382,8 @@ CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
 }
 
 FuncletPadInst *FuncletPadInst::cloneImpl() const {
-  return new (getNumOperands()) FuncletPadInst(*this);
+  IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+  return new (AllocMarker) FuncletPadInst(*this, AllocMarker);
 }
 
 UnreachableInst *UnreachableInst::cloneImpl() const {
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index 00dd9c7..b0aa785 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -145,10 +145,7 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
       ::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate));
   Use *Start = reinterpret_cast<Use *>(Storage + DescBytesToAllocate);
   Use *End = Start + Us;
-  User *Obj = reinterpret_cast<User*>(End);
-  Obj->NumUserOperands = Us;
-  Obj->HasHungOffUses = false;
-  Obj->HasDescriptor = DescBytes != 0;
+  User *Obj = reinterpret_cast<User *>(End);
 
   for (; Start != End; Start++)
     new (Start) Use(Obj);
@@ -160,22 +157,21 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
   return Obj;
 }
 
-void *User::operator new(size_t Size, unsigned Us) {
-  return allocateFixedOperandUser(Size, Us, 0);
+void *User::operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait) {
+  return allocateFixedOperandUser(Size, allocTrait.NumOps, 0);
 }
 
-void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) {
-  return allocateFixedOperandUser(Size, Us, DescBytes);
+void *User::operator new(size_t Size,
+                         IntrusiveOperandsAndDescriptorAllocMarker allocTrait) {
+  return allocateFixedOperandUser(Size, allocTrait.NumOps,
+                                  allocTrait.DescBytes);
 }
 
-void *User::operator new(size_t Size) {
+void *User::operator new(size_t Size, HungOffOperandsAllocMarker) {
   // Allocate space for a single Use*
   void *Storage = ::operator new(Size + sizeof(Use *));
   Use **HungOffOperandList = static_cast<Use **>(Storage);
   User *Obj = reinterpret_cast<User *>(HungOffOperandList + 1);
-  Obj->NumUserOperands = 0;
-  Obj->HasHungOffUses = true;
-  Obj->HasDescriptor = false;
   *HungOffOperandList = nullptr;
   return Obj;
 }
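Note on the pattern above: the refactor threads one alloc-marker value through both operator new and the constructor, so the operand count is decided once at the call site and can be re-checked by an assert in the constructor. A minimal, self-contained sketch of that idiom (Marker and Node are illustrative names, not LLVM API):

#include <cstddef>
#include <new>

// Hypothetical marker carrying the operand count from the allocation site
// to the constructor, in the spirit of IntrusiveOperandsAllocMarker.
struct Marker { unsigned NumOps; };

struct Node {
  unsigned NumOps;
  // Co-allocate extra slots for NumOps operands alongside the object.
  void *operator new(std::size_t Size, Marker M) {
    return ::operator new(Size + sizeof(void *) * M.NumOps);
  }
  // Matching placement delete for exception safety during construction.
  void operator delete(void *Ptr, Marker) { ::operator delete(Ptr); }
  explicit Node(Marker M) : NumOps(M.NumOps) {}
};

Node *makeNode(unsigned NumOperands) {
  // The same marker feeds both sites, so they cannot disagree about the
  // amount of trailing operand storage.
  Marker M{NumOperands};
  return new (M) Node(M);
}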
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ebf262..8f151a9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1181,8 +1181,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   // Enable contextual profiling instrumentation.
   const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink &&
                             PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled();
-  const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt &&
-                            Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
+  const bool IsCtxProfUse =
+      !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
 
   if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen ||
       IsCtxProfUse)
@@ -1673,7 +1673,7 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
   // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let
   // thinlto use the contextual info to perform imports; then use the contextual
   // profile in the post-thinlink phase.
-  if (!UseCtxProfile.empty() && !PGOOpt) {
+  if (!UseCtxProfile.empty()) {
     addRequiredLTOPreLinkPasses(MPM);
     return MPM;
   }
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 18fdcda..df38395 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -662,6 +662,12 @@ Value *SelectInst::create(Value *Cond, Value *True, Value *False,
   return createCommon(Cond, True, False, Name, Builder, Ctx);
 }
 
+void SelectInst::swapValues() {
+  Ctx.getTracker().emplaceIfTracking<UseSwap>(getOperandUse(1),
+                                              getOperandUse(2));
+  cast<llvm::SelectInst>(Val)->swapValues();
+}
+
 bool SelectInst::classof(const Value *From) {
   return From->getSubclassID() == ClassID::Select;
 }
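The new SelectInst::swapValues follows SandboxIR's record-then-mutate discipline: an undo action is registered with the tracker before the underlying IR is touched. A minimal sketch of that discipline under simplified assumptions (Tracker and Select here are illustrative stand-ins, not the SandboxIR classes):

#include <functional>
#include <utility>
#include <vector>

// Hypothetical change tracker: mutators push an inverse action first,
// so a whole sequence of changes can be rolled back on demand.
struct Tracker {
  bool Active = false;
  std::vector<std::function<void()>> Undo;
  template <typename Fn> void recordIfTracking(Fn &&F) {
    if (Active)
      Undo.push_back(std::forward<Fn>(F));
  }
  void revert() {
    for (auto It = Undo.rbegin(), E = Undo.rend(); It != E; ++It)
      (*It)();
    Undo.clear();
  }
};

struct Select {
  int *TrueOp, *FalseOp;
  Tracker &Tr;
  void swapValues() {
    // Record the inverse (another swap) before mutating the operands.
    Tr.recordIfTracking([this] { std::swap(TrueOp, FalseOp); });
    std::swap(TrueOp, FalseOp);
  }
};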
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4da3618..9f89264 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -353,23 +353,20 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
 }
 
 /// Match an fpext from half to float, or a constant we can convert.
-static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
-  if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
-    return FPExtSrc->getType()->isHalfTy();
-
-  ConstantFP *CFP;
-  if (match(Arg, m_ConstantFP(CFP))) {
+static Value *matchFPExtFromF16(Value *Arg) {
+  Value *Src = nullptr;
+  ConstantFP *CFP = nullptr;
+  if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
+    if (Src->getType()->isHalfTy())
+      return Src;
+  } else if (match(Arg, m_ConstantFP(CFP))) {
     bool LosesInfo;
     APFloat Val(CFP->getValueAPF());
     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
-    if (LosesInfo)
-      return false;
-
-    FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
-    return true;
+    if (!LosesInfo)
+      return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
   }
-
-  return false;
+  return nullptr;
 }
 
 // Trim all zero components from the end of the vector \p UseV and return
@@ -839,15 +836,16 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (!ST->hasMed3_16())
       break;
 
-    Value *X, *Y, *Z;
-
     // Repeat floating-point width reduction done for minnum/maxnum.
     // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
-    if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
-        matchFPExtFromF16(Src2, Z)) {
-      Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
-                                                  {X, Y, Z}, &II, II.getName());
-      return new FPExtInst(NewCall, II.getType());
+    if (Value *X = matchFPExtFromF16(Src0)) {
+      if (Value *Y = matchFPExtFromF16(Src1)) {
+        if (Value *Z = matchFPExtFromF16(Src2)) {
+          Value *NewCall = IC.Builder.CreateIntrinsic(
+              IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
+          return new FPExtInst(NewCall, II.getType());
+        }
+      }
    }
 
    break;
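The matchFPExtFromF16 change is a small but common refactor: return the matched value directly instead of a bool plus out-parameter, so each binding is only visible on the success path. A generic sketch of the idiom under simplified assumptions (matchEven is a made-up helper, purely illustrative):

#include <optional>

// An empty optional means "no match"; callers never see a half-written
// out-parameter from a failed match.
static std::optional<int> matchEven(int Arg) {
  if (Arg % 2 == 0)
    return Arg / 2;
  return std::nullopt;
}

bool foldThree(int A, int B, int C) {
  // Mirrors the nested-if rewrite of the fmed3 fold above: each value is
  // scoped to the branch where its match succeeded.
  if (auto X = matchEven(A))
    if (auto Y = matchEven(B))
      if (auto Z = matchEven(C))
        return (*X + *Y + *Z) >= 0; // All three matched; use the values.
  return false;
}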
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 56f4efd..e657f66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                       IID == Intrinsic::amdgcn_permlanex16;
+  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
 
   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                       Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
     case Intrinsic::amdgcn_permlane64:
       return LaneOp.getReg(0);
     case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_set_inactive:
+    case Intrinsic::amdgcn_set_inactive_chain_arg:
       return LaneOp.addUse(Src1).getReg(0);
     case Intrinsic::amdgcn_writelane:
       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   Register Src0 = MI.getOperand(2).getReg();
   Register Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsPermLane16) {
+      IsSetInactive || IsPermLane16) {
     Src1 = MI.getOperand(3).getReg();
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
       Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   if (Size < 32) {
     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
-    if (IsPermLane16)
+    if (IsSetInactive || IsPermLane16)
       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
   MachineInstrBuilder Src1Parts, Src2Parts;
 
-  if (IsPermLane16)
+  if (IsSetInactive || IsPermLane16)
     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
 
   if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
 
   for (unsigned i = 0; i < NumParts; ++i) {
     Src0 = Src0Parts.getReg(i);
 
-    if (IsPermLane16)
+    if (IsSetInactive || IsPermLane16)
       Src1 = Src1Parts.getReg(i);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_set_inactive_chain_arg:
     return legalizeLaneOp(Helper, MI, IntrID);
   case Intrinsic::amdgcn_s_buffer_prefetch_data:
     return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9a6d617..62e22c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   unsigned IID = N->getConstantOperandVal(0);
   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                       IID == Intrinsic::amdgcn_permlanex16;
+  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
   SDLoc SL(N);
   MVT IntVT = MVT::getIntegerVT(ValSize);
 
@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       Operands.push_back(Src2);
       [[fallthrough]];
     case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_set_inactive:
+    case Intrinsic::amdgcn_set_inactive_chain_arg:
       Operands.push_back(Src1);
       [[fallthrough]];
     case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   SDValue Src0 = N->getOperand(1);
   SDValue Src1, Src2;
   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
-      IsPermLane16) {
+      IsSetInactive || IsPermLane16) {
     Src1 = N->getOperand(2);
     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
       Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
                                 SL, MVT::i32);
 
-    if (IsPermLane16) {
+    if (IsSetInactive || IsPermLane16) {
       Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
                                   SL, MVT::i32);
     }
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
       Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
                                DAG.getConstant(EltIdx, SL, MVT::i32));
 
-      if (IsPermLane16)
+      if (IsSetInactive || IsPermLane16)
         Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
                                  DAG.getConstant(EltIdx, SL, MVT::i32));
 
@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                                  DAG.getConstant(EltIdx, SL, MVT::i32));
 
       Pieces.push_back(
-          IsPermLane16
+          IsSetInactive || IsPermLane16
              ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
              : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
       EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
     MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
     Src0 = DAG.getBitcast(VecVT, Src0);
 
-    if (IsPermLane16)
+    if (IsSetInactive || IsPermLane16)
       Src1 = DAG.getBitcast(VecVT, Src1);
 
     if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_permlane16:
   case Intrinsic::amdgcn_permlanex16:
   case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_set_inactive_chain_arg:
     return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 899d937..e4a679f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   }
 }
 
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
-  assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
-  for (auto &Op : MI.implicit_operands()) {
-    if (Op.isDef())
-      continue;
-    Register OpReg = Op.getReg();
-    if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
-        OpReg == AMDGPU::SCC)
-      continue;
-    return OpReg;
-  }
-  return Register();
-}
-
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32:
-  case AMDGPU::V_SET_INACTIVE_B64: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
-                           ? AMDGPU::V_MOV_B64_PSEUDO
-                           : AMDGPU::V_MOV_B32_e32;
-    Register ExecReg = RI.getExec();
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
     Register DstReg = MI.getOperand(0).getReg();
-    MachineOperand &ActiveSrc = MI.getOperand(1);
-    MachineOperand &InactiveSrc = MI.getOperand(2);
-
-    // Find implicit register defining lanes active outside WWM.
-    Register ExecSrcReg = findSetInactiveMask(MI);
-    assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
-    // Note: default here is set to ExecReg so that functional MIR is still
-    // generated if implicit def is not found and assertions are disabled.
-    if (!ExecSrcReg)
-      ExecSrcReg = ExecReg;
-
-    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
-    // constant bus constraints and the presence of literal constants
-    // present an issue.
-    // Fallback to V_MOV base lowering in all but the common cases.
-    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
-    MachineFunction *MF = MBB.getParent();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
-    const MCInstrDesc &Desc = get(Opcode);
-
-    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
-    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
-    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
-    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
-    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
-    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
-    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
-    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
-    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
-    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
-    int ConstantBusUses =
-        1 + // Starts at 1 for ExecSrcReg
-        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
-        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
-    int LiteralConstants =
-        ((ActiveSrc.isReg() ||
-          (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
-             ? 0
-             : 1) +
-        ((InactiveSrc.isReg() ||
-          (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
-             ? 0
-             : 1);
-
-    bool UseVCndMask =
-        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
-    if (VMov64 && UseVCndMask) {
-      // Decomposition must not introduce new literals.
-      UseVCndMask &=
-          ActiveSrc.isReg() ||
-          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
-          (!isInlineConstant(ActiveImm));
-      UseVCndMask &= InactiveSrc.isReg() ||
-                     (isInlineConstant(InactiveImmLo) &&
-                      isInlineConstant(InactiveImmHi)) ||
-                     (!isInlineConstant(InactiveImm));
-    }
-
-    if (UseVCndMask && VMov64) {
-      // Dual V_CNDMASK_B32
-      MachineOperand ActiveLo = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand ActiveHi = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      MachineOperand InactiveLo = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand InactiveHi = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      if (ActiveSrc.isReg())
-        ActiveHi.setIsKill(ActiveSrc.isKill());
-      if (InactiveSrc.isReg())
-        InactiveHi.setIsKill(InactiveSrc.isKill());
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
-          .addImm(0)
-          .add(InactiveLo)
-          .addImm(0)
-          .add(ActiveLo)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
-          .addImm(0)
-          .add(InactiveHi)
-          .addImm(0)
-          .add(ActiveHi)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-    } else if (UseVCndMask) {
-      // Single V_CNDMASK_B32
-      BuildMI(MBB, MI, DL, Desc, DstReg)
-          .addImm(0)
-          .add(InactiveSrc)
-          .addImm(0)
-          .add(ActiveSrc)
-          .addReg(ExecSrcReg);
-    } else {
-      // Fallback V_MOV case.
-      // Avoid unnecessary work if a source VGPR is also the destination.
-      // This can happen if WWM register allocation was efficient.
-      // Note: this assumes WWM execution.
-      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
-      bool DstIsInactive =
-          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
-      if (!DstIsInactive) {
-        // Set exec mask to inactive lanes,
-        // but only if active lanes would be overwritten.
-        if (DstIsActive) {
-          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
-              .addReg(ExecSrcReg)
-              .setOperandDead(3); // Dead scc
-        }
-        // Copy inactive lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      if (!DstIsActive) {
-        // Set exec mask to active lanes
-        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
-        // Copy active lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
-                .add(ActiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      // Restore WWM
-      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
-    }
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .add(MI.getOperand(3))
+        .add(MI.getOperand(4))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(2))
+        .add(MI.getOperand(5));
     MI.eraseFromParent();
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7143251..4fd9b43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ public:
   // This is used if an operand is a 32 bit register but needs to be aligned
   // regardless.
   void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
-
-  static Register findSetInactiveMask(const MachineInstr &MI);
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
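With the lane mask carried as an explicit operand, V_SET_INACTIVE_B32 reduces to a single per-lane select (V_CNDMASK_B32), replacing the old exec-mask juggling above. A scalar model of the select semantics (illustrative only; the real lowering operates on machine IR, with the mask in an SGPR pair):

#include <cstdint>

// Per-lane: take the "active" source where the mask bit is set and the
// "inactive" source elsewhere. This is exactly a conditional move.
uint32_t setInactiveLane(bool MaskBit, uint32_t ActiveSrc,
                         uint32_t InactiveSrc) {
  return MaskBit ? ActiveSrc : InactiveSrc;
}

// Whole-wave view: one select per lane, driven by one bit of the mask.
void setInactiveWave(uint64_t Mask, const uint32_t *Active,
                     const uint32_t *Inactive, uint32_t *Dst,
                     unsigned WaveSize) {
  for (unsigned Lane = 0; Lane < WaveSize; ++Lane)
    Dst[Lane] =
        setInactiveLane((Mask >> Lane) & 1, Active[Lane], Inactive[Lane]);
}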
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9f0796..284be72 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
 
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
-let Defs = [SCC], isConvergent = 1 in {
-def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
-  (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
-
-def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
-  (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
-} // End Defs = [SCC]
+let isConvergent = 1 in
+def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
 
 foreach vt = Reg32Types.types in {
 def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
-     (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
-}
-
-foreach vt = Reg64Types.types in {
-def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
-     (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+     (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
 }
 
 def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
-    (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
-
-def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
-    (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+    (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
 let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 29fef49..3bf2ea0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock *MBB : RPOT) {
     bool InWWM = false;
     for (MachineInstr &MI : *MBB) {
-      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-          MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+      if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
         RegsAssigned |= processDef(MI.getOperand(0));
 
       if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38ebda6..8cedc34 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -557,26 +557,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       // This avoids unnecessarily marking M0 as requiring WQM.
       III.Needs |= StateStrictWQM;
       GlobalFlags |= StateStrictWQM;
-    } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
-               Opcode == AMDGPU::V_SET_INACTIVE_B64) {
-      // Ignore these if V_SET_INACTIVE which already has exec src register.
-      // These are generated by an earlier pass which has separately ensured
-      // WWM and provided a mask of inactive lanes.
-      Register ExecSrc = TII->findSetInactiveMask(MI);
-      if (!ExecSrc) {
-        // Disable strict states; StrictWQM will be added as required later.
-        III.Disabled = StateStrict;
-        MachineOperand &Inactive = MI.getOperand(2);
-        if (Inactive.isReg()) {
-          if (Inactive.isUndef()) {
-            LowerToCopyInstrs.insert(&MI);
-          } else {
-            markOperand(MI, Inactive, StateStrictWWM, Worklist);
-          }
-        }
-        SetInactiveInstrs.push_back(&MI);
-        BBI.NeedsLowering = true;
-      }
+    } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
+      // Disable strict states; StrictWQM will be added as required later.
+      III.Disabled = StateStrict;
+      MachineOperand &Inactive = MI.getOperand(4);
+      if (Inactive.isReg()) {
+        if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
+          LowerToCopyInstrs.insert(&MI);
+        else
+          markOperand(MI, Inactive, StateStrictWWM, Worklist);
+      }
+      SetInactiveInstrs.push_back(&MI);
+      BBI.NeedsLowering = true;
     } else if (TII->isDisableWQM(MI)) {
       BBI.Needs |= StateExact;
       if (!(BBI.InNeeds & StateExact)) {
@@ -1078,10 +1070,11 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
         ActiveLanesReg = 0;
       break;
     case AMDGPU::V_SET_INACTIVE_B32:
-    case AMDGPU::V_SET_INACTIVE_B64:
       if (ActiveLanesReg) {
-        MI.addOperand(*MBB.getParent(),
-                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
+        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
+        MI.getOperand(5).setReg(ActiveLanesReg);
+        LIS->shrinkToUses(&LI);
       } else {
         assert(State == StateExact || State == StateWQM);
       }
@@ -1527,15 +1520,17 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
   for (MachineInstr *MI : LowerToCopyInstrs) {
     LLVM_DEBUG(dbgs() << "simplify: " << *MI);
 
-    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
-      assert(MI->getNumExplicitOperands() == 3);
+    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
+      assert(MI->getNumExplicitOperands() == 6);
 
       LiveInterval *RecomputeLI = nullptr;
-      if (MI->getOperand(2).isReg())
-        RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg());
+      if (MI->getOperand(4).isReg())
+        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
 
-      MI->removeOperand(2);
+      MI->removeOperand(5);
+      MI->removeOperand(4);
+      MI->removeOperand(3);
+      MI->removeOperand(1);
 
       if (RecomputeLI)
         LIS->shrinkToUses(RecomputeLI);
@@ -1547,12 +1542,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
           ? (unsigned)AMDGPU::COPY
          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                *MRI, MI->getOperand(0)));
-      int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-      while (Index >= 0) {
-        MI->removeOperand(Index);
-        Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
-      }
-
      MI->setDesc(TII->get(CopyOp));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
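The LowerToCopyInstrs path above relies on a simple identity: if the "inactive" operand is undef (and the source modifiers are zero), the per-lane select may legally return the active source on every lane, so the whole instruction degenerates to a copy. A small model of that reasoning (illustrative, with undef modeled as an empty optional):

#include <cstdint>
#include <optional>

// When Inactive is undef, any value is a valid result for inactive lanes,
// so choosing Active everywhere is correct -- the op is just a copy.
uint32_t setInactive(uint32_t Active, std::optional<uint32_t> Inactive,
                     bool MaskBit) {
  if (!Inactive)
    return Active; // Undef inactive: the instruction folds to a copy.
  return MaskBit ? Active : *Inactive;
}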
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4..082546c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -139,16 +139,21 @@
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <numeric>
 #include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
 // pointer in parameter AS.
 // For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
 // generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+                             bool IsGridConstant) {
   Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
   assert(I && "OldUse must be in an instruction");
   struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
   SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
   SmallVector<Instruction *> InstructionsToDelete;
 
-  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+  auto CloneInstInParamAS = [HasCvtaParam,
+                             IsGridConstant](const IP &I) -> Value * {
     if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
       LI->setOperand(0, I.NewParam);
       return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
       // Just pass through the argument, the old ASC is no longer needed.
       return I.NewParam;
     }
+    if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+      if (MI->getRawSource() == I.OldUse->get()) {
+        // Convert to memcpy/memmove from param space.
+        IRBuilder<> Builder(I.OldInstruction);
+        Intrinsic::ID ID = MI->getIntrinsicID();
+
+        CallInst *B = Builder.CreateMemTransferInst(
+            ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+        for (unsigned I : {0, 1})
+          if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+            B->addDereferenceableParamAttr(I, Bytes);
+        return B;
+      }
+      // We may be able to handle other cases if the argument is
+      // __grid_constant__
+    }
 
-    if (GridConstant) {
+    if (HasCvtaParam) {
       auto GetParamAddrCastToGeneric =
           [](Value *Addr, Instruction *OriginalUser) -> Value * {
         PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
             OriginalUser->getIterator());
         return CvtToGenCall;
       };
-
-      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
-        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
-        return CI;
+      auto *ParamInGenericAS =
+          GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+      // phi/select could use generic arg pointers w/o __grid_constant__
+      if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+        for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+          if (V.get() == I.OldUse->get())
+            PHI->setIncomingValue(Idx, ParamInGenericAS);
+        }
       }
-      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
-        // byval address is being stored, cast it to generic
-        if (SI->getValueOperand() == I.OldUse->get())
-          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
-        return SI;
+      if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+        if (SI->getTrueValue() == I.OldUse->get())
+          SI->setTrueValue(ParamInGenericAS);
+        if (SI->getFalseValue() == I.OldUse->get())
+          SI->setFalseValue(ParamInGenericAS);
       }
-      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
-        if (PI->getPointerOperand() == I.OldUse->get())
-          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
-        return PI;
+
+      // Escapes or writes can only use generic param pointers if
+      // __grid_constant__ is in effect.
+      if (IsGridConstant) {
+        if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+          I.OldUse->set(ParamInGenericAS);
+          return CI;
+        }
+        if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+          // byval address is being stored, cast it to generic
+          if (SI->getValueOperand() == I.OldUse->get())
+            SI->setOperand(0, ParamInGenericAS);
+          return SI;
+        }
+        if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+          if (PI->getPointerOperand() == I.OldUse->get())
+            PI->setOperand(0, ParamInGenericAS);
+          return PI;
+        }
+        // TODO: If we allow stores, we should allow memcpy/memset to
+        // parameter, too.
       }
-      llvm_unreachable(
-          "Instruction unsupported even for grid_constant argument");
     }
 
     llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,110 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   }
 }
 
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+  using Base = PtrUseVisitor<ArgUseChecker>;
+
+  bool IsGridConstant;
+  // Set of phi/select instructions using the Arg
+  SmallPtrSet<Instruction *, 4> Conditionals;
+
+  ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+      : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+  PtrInfo visitArgPtr(Argument &A) {
+    assert(A.getType()->isPointerTy());
+    IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+    IsOffsetKnown = false;
+    Offset = APInt(IntIdxTy->getBitWidth(), 0);
+    PI.reset();
+    Conditionals.clear();
+
+    LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+    // Enqueue the uses of this pointer.
+    enqueueUsers(A);
+
+    // Visit all the uses off the worklist until it is empty.
+    // Note that unlike PtrUseVisitor we intentionally do not track offsets.
+    // We're only interested in how we use the pointer.
+    while (!(Worklist.empty() || PI.isAborted())) {
+      UseToVisit ToVisit = Worklist.pop_back_val();
+      U = ToVisit.UseAndIsOffsetKnown.getPointer();
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (isa<PHINode>(I) || isa<SelectInst>(I))
+        Conditionals.insert(I);
+      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+      Base::visit(I);
+    }
+    if (PI.isEscaped())
+      LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+                        << "\n");
+    else if (PI.isAborted())
+      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+                        << "\n");
+    LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
+                      << " conditionals\n");
+    return PI;
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    // Storing the pointer escapes it.
+    if (U->get() == SI.getValueOperand())
+      return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+    // copy.
+    if (!IsGridConstant)
+      return PI.setAborted(&SI);
+  }
+
+  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+    // ASC to param space are no-ops and do not need a copy
+    if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+      return PI.setEscapedAndAborted(&ASC);
+    Base::visitAddrSpaceCastInst(ASC);
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    if (IsGridConstant)
+      return;
+    Base::visitPtrToIntInst(I);
+  }
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+  }
+  // PHI and select just pass through the pointers.
+  void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+  void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+    // memcpy/memmove are OK when the pointer is source. We can convert them to
+    // AS-specific memcpy.
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    if (!IsGridConstant)
+      PI.setAborted(&II);
+  }
+}; // struct ArgUseChecker
+} // namespace
+
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
-  bool IsGridConstant = isParamGridConstant(*Arg);
   Function *Func = Arg->getParent();
+  bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+  bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+  const DataLayout &DL = Func->getDataLayout();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
-  auto AreSupportedUsers = [&](Value *Start) {
-    SmallVector<Value *, 16> ValuesToCheck = {Start};
-    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
-      if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
-        return true;
-      // ASC to param space are OK, too -- we'll just strip them.
-      if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
-        if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
-          return true;
-      }
-      // Simple calls and stores are supported for grid_constants
-      // writes to these pointers are undefined behaviour
-      if (IsGridConstant &&
-          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
-        return true;
-      return false;
-    };
-
-    while (!ValuesToCheck.empty()) {
-      Value *V = ValuesToCheck.pop_back_val();
-      if (!IsSupportedUse(V)) {
-        LLVM_DEBUG(dbgs() << "Need a "
-                          << (isParamGridConstant(*Arg) ? "cast " : "copy ")
-                          << "of " << *Arg << " because of " << *V << "\n");
-        (void)Arg;
-        return false;
-      }
-      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
-          !isa<PtrToIntInst>(V))
-        llvm::append_range(ValuesToCheck, V->users());
-    }
-    return true;
-  };
-
-  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+  ArgUseChecker AUC(DL, IsGridConstant);
+  ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
+  // Easy case, accessing parameter directly is fine.
+  if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
     SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +567,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
         Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM),
         Arg->getName(), FirstInst);
     for (Use *U : UsesToUpdate)
-      convertToParamAS(U, ArgInParamAS, IsGridConstant);
+      convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
     LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
 
     const auto *TLI =
@@ -473,13 +578,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     return;
   }
 
-  const DataLayout &DL = Func->getDataLayout();
+  // We can't access the byval arg directly and need a pointer. On sm_70+ we
+  // have the ability to take a pointer to the argument without making a local
+  // copy. However, we're still not allowed to write to it. If the user
+  // specified `__grid_constant__` for the argument, we'll consider escaped
+  // pointer as read-only.
   unsigned AS = DL.getAllocaAddrSpace();
-  if (isParamGridConstant(*Arg)) {
-    // Writes to a grid constant are undefined behaviour. We do not need a
-    // temporary copy. When a pointer might have escaped, conservatively replace
-    // all of its uses (which might include a device function call) with a cast
-    // to the generic address space.
+  if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
+    LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+    // Replace all argument pointer uses (which might include a device function
+    // call) with a cast to the generic address space using cvta.param
+    // instruction, which avoids a local copy.
    IRBuilder<> IRB(&Func->getEntryBlock().front());
 
    // Cast argument to param address space
@@ -500,6 +609,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     // Do not replace Arg in the cast to param space
     CastToParam->setOperand(0, Arg);
   } else {
+    LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
     // Otherwise we have to create a temporary copy.
     AllocaInst *AllocA =
         new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
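The rewritten handleByValParam boils down to a three-way policy. A condensed sketch of that decision logic as inferred from the pass above (the enum and helper are illustrative, not part of the pass):

enum class ByValStrategy { DirectParamAccess, GenericPointerNoCopy, LocalCopy };

// Read-only uses with no phi/select can stay in the param address space;
// sm_70+/PTX 7.7 (cvta.param) permits a no-copy generic pointer for
// read-only or __grid_constant__ arguments; everything else falls back to
// a local alloca copy.
ByValStrategy choose(bool ReadOnly, bool HasConditionals, bool HasCvtaParam,
                     bool IsGridConstant) {
  if (ReadOnly && !HasConditionals)
    return ByValStrategy::DirectParamAccess;
  if (HasCvtaParam && (ReadOnly || IsGridConstant))
    return ByValStrategy::GenericPointerNoCopy;
  return ByValStrategy::LocalCopy;
}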
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0591782..457f10f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -94,6 +94,7 @@ public:
   bool hasDotInstructions() const {
     return SmVersion >= 61 && PTXVersion >= 50;
   }
+  bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d2e6257..4554163 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1106,18 +1106,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         if (!isTypeLegal(VT))
           continue;
         setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
-        setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
         setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
                            Custom);
+        setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
+        setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
+                           Custom);
+        setOperationAction(ISD::SELECT_CC, VT, Expand);
         setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
                             ISD::EXTRACT_SUBVECTOR},
                            VT, Custom);
-        setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
         if (Subtarget.hasStdExtZfbfmin())
           setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
-        setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
-                           Custom);
-        setOperationAction(ISD::SELECT_CC, VT, Expand);
+        setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
         // TODO: Promote to fp32.
       }
     }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 13212c2..02f65ff 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2918,7 +2918,7 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
     // if any possible.
     if (MO.getTargetFlags() == RISCVII::MO_PCREL_LO &&
         (MI.getMF()->getTarget().getFunctionSections() || F.hasComdat() ||
-         F.hasSection()))
+         F.hasSection() || F.getSectionPrefix()))
      return outliner::InstrType::Illegal;
   }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 5cc084f..1875a8f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -96,6 +96,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       setOperationAction(ISD::STORE, T, Custom);
     }
   }
+  if (Subtarget->hasFP16()) {
+    setOperationAction(ISD::LOAD, MVT::v8f16, Custom);
+    setOperationAction(ISD::STORE, MVT::v8f16, Custom);
+  }
   if (Subtarget->hasReferenceTypes()) {
     // We need custom load and store lowering for both externref, funcref and
     // Other. The MVT::Other here represents tables of reference types.
@@ -208,6 +212,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
                     MVT::v2f64})
       setOperationAction(ISD::BUILD_VECTOR, T, Custom);
 
+    if (Subtarget->hasFP16())
+      setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom);
+
     // We have custom shuffle lowering to expose the shuffle mask
     for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
                    MVT::v2f64})
@@ -2055,6 +2062,18 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
 
 SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  if (VT == MVT::v8f16) {
+    // BUILD_VECTOR can't handle FP16 operands since Wasm doesn't have a scalar
+    // FP16 type, so cast them to I16s.
+    MVT IVT = VT.changeVectorElementType(MVT::i16);
+    SmallVector<SDValue, 8> NewOps;
+    for (unsigned I = 0, E = Op.getNumOperands(); I < E; ++I)
+      NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+    SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
+    return DAG.getBitcast(VT, Res);
+  }
+
   if (auto ConvertLow = LowerConvertLow(Op, DAG))
     return ConvertLow;
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 9d17d90..9be23da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -189,7 +189,7 @@ defm LOAD_V128_A64 :
 }
 
 // Def load patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
 defm : LoadPat<vec.vt, load, "LOAD_V128">;
 }
 
@@ -390,7 +390,7 @@ defm STORE_V128_A64 :
 }
 
 // Def store patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
 defm : StorePat<vec.vt, store, "STORE_V128">;
 }
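The v8f16 BUILD_VECTOR lowering sidesteps the missing scalar f16 type by moving each element through its 16-bit pattern. A host-side analogue of that bit-level reinterpretation (illustrative only; _Float16 requires a compiler and target with native FP16 support, e.g. recent clang on x86-64 or AArch64):

#include <cstdint>
#include <cstring>

// Analogue of DAG.getBitcast(MVT::i16, f16op): reinterpret a 16-bit
// float's storage as an integer without converting the value, so the
// element can travel through i16 lanes and be bitcast back at the end.
uint16_t bitsOfHalf(_Float16 H) {
  uint16_t Bits;
  static_assert(sizeof(H) == sizeof(Bits), "f16 must be 16 bits wide");
  std::memcpy(&Bits, &H, sizeof(Bits));
  return Bits;
}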
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b821da0..3b6b154 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1182,13 +1182,23 @@ public:
                                 InstructionCost Cost) {
     assert(VF.isVector() && "Expected VF >=2");
     /// Broadcast this decision to all instructions inside the group.
-    /// But the cost will be assigned to one instruction only.
+    /// When interleaving, the cost will only be assigned one instruction, the
+    /// insert position. For other cases, add the appropriate fraction of the
+    /// total cost to each instruction. This ensures accurate costs are used,
+    /// even if the insert position instruction is not used.
+    InstructionCost InsertPosCost = Cost;
+    InstructionCost OtherMemberCost = 0;
+    if (W != CM_Interleave)
+      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
+    ;
     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
       if (auto *I = Grp->getMember(Idx)) {
         if (Grp->getInsertPos() == I)
-          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+          WideningDecisions[std::make_pair(I, VF)] =
+              std::make_pair(W, InsertPosCost);
         else
-          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+          WideningDecisions[std::make_pair(I, VF)] =
+              std::make_pair(W, OtherMemberCost);
       }
     }
   }
@@ -8663,6 +8673,7 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
            !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
           isa<VPWidenPointerInductionRecipe>(V) ||
           (isa<Instruction>(IncomingValue) &&
+           OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
           any_of(IncomingValue->users(), [&Inductions](User *U) {
             auto *P = dyn_cast<PHINode>(U);
             return P && Inductions.contains(P);
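The effect of the fractional cost assignment is visible in the cost-model test updates that follow: for the stride-5 float load group on AVX2, the old model charged the full group cost to the first member (e.g. 75 at VF 8) and 0 to the rest, while the new model charges an equal share (15 each). A small worked check (the numbers mirror the AVX2 stride-5 test; the helper itself is illustrative):

#include <cassert>
#include <vector>

// Split a group cost evenly across members, as the non-interleave path
// above now does, instead of letting the insert position absorb it all.
std::vector<int> splitGroupCost(int TotalCost, int NumMembers) {
  return std::vector<int>(NumMembers, TotalCost / NumMembers);
}

int main() {
  // Stride-5 f32 load group at VF 8 on AVX2: total cost 75 over 5 loads.
  for (int C : splitGroupCost(75, 5))
    assert(C == 15); // Matches the updated CHECK lines: 15 per member.
  // The total charged to the group is unchanged: 5 * 15 == 75.
  return 0;
}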
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
index e9be6f5..c5d2ebf 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
@@ -28,24 +28,24 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F,
   return PA;
 }
 
-bool SandboxVectorizerPass::runImpl(Function &F) {
+bool SandboxVectorizerPass::runImpl(Function &LLVMF) {
   // If the target claims to have no vector registers early return.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
     LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n");
     return false;
   }
 
-  LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n");
+  LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << LLVMF.getName() << ".\n");
   // Early return if the attribute NoImplicitFloat is used.
-  if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+  if (LLVMF.hasFnAttribute(Attribute::NoImplicitFloat)) {
     LLVM_DEBUG(dbgs() << "SBVec: NoImplicitFloat attribute, return.\n");
     return false;
   }
 
-  sandboxir::Context Ctx(F.getContext());
-  // Create SandboxIR for `F`.
-  sandboxir::Function &SBF = *Ctx.createFunction(&F);
+  sandboxir::Context Ctx(LLVMF.getContext());
+  // Create SandboxIR for `LLVMF`.
+  sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
 
   // TODO: Initialize SBVec Pass Manager
-  (void)SBF;
+  (void)F;
 
   return false;
 }
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
index 29dce5f..57ae02a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
@@ -82,26 +82,26 @@ define void @test() {
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -139,11 +139,11 @@ define void @test() {
 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
index 0e7b1c5..2cc5150 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
@@ -108,34 +108,34 @@ define void @test() {
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 1 for VF 1
For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For 
instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -180,13 +180,13 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For 
instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll index 8830aff..1899741 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll @@ -121,22 +121,22 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For 
instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 @@ -145,14 +145,14 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: 
Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -203,14 +203,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll index cfd3d78..2d4b300 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll @@ -71,10 +71,10 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, 
align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll index 5ec5b51..5dfb25e 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll @@ -88,12 +88,12 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll index 450743d..bd88ca8 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll @@ -107,14 +107,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 
For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll index 5e5c718..9c079863 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll @@ -72,21 +72,21 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: 
Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -114,16 +114,16 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load 
double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll index 62541fa2..99a735d 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll @@ -133,18 +133,18 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr 
%in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll index cfed855..168e916 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll @@ -94,27 +94,27 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load 
double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -152,20 +152,20 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For 
instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll index 07939b9..919a17e 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll @@ -105,30 +105,30 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 
8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load 
double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -163,22 +163,22 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an 
estimated cost of 10 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll index 964a9b6..6737c72 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll @@ -93,31 +93,31 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 
16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: 
%v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll index 6653198..46d56a7 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll @@ -123,41 +123,41 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, 
ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 
33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll index b3a5cbe..4d65abd 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll @@ -138,46 +138,46 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found 
an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, 
ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll index c0ea210..28a6443 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll @@ -73,8 +73,8 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 ; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr 
%in1, align 4 -; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll index 2a261ca..5cad7bf 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll @@ -82,26 +82,26 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For 
instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -139,11 +139,11 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll index 8bf3071..cfb83d4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll +++ 
b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll @@ -108,34 +108,34 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For 
instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -180,13 +180,13 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an 
estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll index 3182de2..7757854 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll @@ -121,22 +121,22 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr 
%in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 @@ -145,14 +145,14 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: 
LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -203,14 +203,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll index 27e2ee0..cf350cc 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll @@ -71,10 +71,10 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 
For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll index c377232..9ca0d8c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll @@ -88,12 +88,12 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll index 2eb7c5e..86ee6c8 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll @@ -107,14 +107,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, 
ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll index c11da43..f6143d4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll @@ -72,21 +72,21 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 4 
for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 @@ -114,16 +114,16 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, 
align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll index de57af6..43dc53d 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll @@ -133,18 +133,18 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr 
%in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll index 949c1af..70ed74dc 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll @@ -94,27 +94,27 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, 
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -152,20 +152,20 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
index 4388ccf..401e4de 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
@@ -105,30 +105,30 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
index 6078fb4..ef3c80c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -93,31 +93,31 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
index 778a4e7..8e7c316 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
@@ -123,41 +123,41 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
index a230b5a..752cc22 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
@@ -138,46 +138,46 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index c1a66c1..ed2bb3f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
index 7be9577..a539877 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 13a8442..41dd266 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -38,8 +38,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9c634ab..89c3bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -85,7 +85,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -159,7 +160,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -233,7 +235,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -300,7 +303,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index fdce9d9..8eef3d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -77,7 +77,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -170,7 +171,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -255,7 +257,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 137366a..f015099 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,15 +4,13 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -45,17 +43,15 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -89,22 +85,18 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0
-; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c
+; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3]
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s6, 56
-; GCN-NEXT: s_cselect_b32 s3, 1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_cmp_lg_u32 s4, 56
; GCN-NEXT: v_mov_b32_e32 v1, v0
-; GCN-NEXT: s_cmp_lg_u32 s3, 0
+; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
@@ -145,15 +137,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -174,13 +167,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -193,15 +188,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -216,15 +212,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -240,19 +237,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 1 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: 
v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -267,19 +260,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 1.0 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -293,15 +282,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -318,21 +308,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x10001 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> 
<i16 1, i16 1, i16 1, i16 1>) #0 %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) @@ -345,21 +334,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0 %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) @@ -372,21 +360,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0 %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) @@ -398,17 +385,15 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -422,15 +407,13 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -445,15 +428,13 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -468,15 +449,13 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -491,15 +470,13 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git 
a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 5a8df7b8..b17dfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2890,68 +2890,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 
v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -2970,8 +2967,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,68 +2981,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, 
exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; 
GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -3064,8 +3058,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3077,23 +3071,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -3184,23 +3176,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -3275,31 +3265,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -3388,53 +3377,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp 
v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: 
v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3478,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -3594,29 +3582,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6611,68 +6600,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, 
v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 @@ -6691,8 +6677,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; 
GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6705,68 +6691,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: 
v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 @@ -6785,8 +6768,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6798,23 +6781,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -6905,23 +6886,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -6996,31 +6975,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp 
v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7109,53 +7087,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -7199,31 +7177,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7315,29 +7292,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6bf03a2..988bc8e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2361,84 +2361,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 
0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; 
GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -2450,83 +2448,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: 
$vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -2538,23 +2534,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -2605,57 +2599,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; 
GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -2685,33 +2678,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: 
v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm @@ -2722,31 +2716,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -2790,34 +2783,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 -; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB6_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: 
v_readfirstlane_b32 s1, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v10 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 @@ -2830,77 +2824,77 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: 
v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB6_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v8, v1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v10 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 @@ -3163,164 +3157,158 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX8_DPP-LABEL: add_i64_varying_nouse: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, 
v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s0 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX8_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB7_2: ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: add_i64_varying_nouse: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 
row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX9_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB7_2: ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i64_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -3345,16 +3333,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 ; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_add_u64 v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB7_2: @@ -3362,25 +3351,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1032_DPP-LABEL: add_i64_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -3398,14 +3383,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1032_DPP-NEXT: ds_add_u64 v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB7_2: @@ -3417,30 +3403,29 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc @@ -3460,15 +3445,16 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 
v3, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1164_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB7_2: @@ -3480,49 +3466,49 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, v3 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1132_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB7_2: @@ -5909,84 +5895,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; 
GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 
row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -5998,83 +5982,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB14_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -6086,23 +6068,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -6153,57 +6133,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: 
v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -6233,33 +6212,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm @@ -6270,31 +6250,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: 
v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6338,34 +6317,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; 
GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 -; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB14_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v10 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 @@ -6378,77 +6358,77 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: 
v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB14_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v10 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 @@ -7432,129 +7412,121 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: and_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX8_DPP-NEXT: s_mov_b32 s2, -1 -; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v6 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX8_DPP-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX8_DPP-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 
v[5:6], off, s[4:7], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: and_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v6 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9_DPP-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: and_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] -; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7575,63 +7547,62 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; 
GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 -; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7643,199 +7614,201 @@ define amdgpu_kernel void 
@and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB16_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; 
; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 -; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB16_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -8816,54 +8789,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: or_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, 
s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8_DPP-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8874,53 +8845,51 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: or_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc 
bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8931,14 +8900,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8959,63 +8926,62 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 
v6, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9027,199 +8993,201 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: 
v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; 
GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB18_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; 
GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 -; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB18_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -10200,54 +10168,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: xor_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf 
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB20_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10258,53 +10224,51 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: xor_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB20_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10315,14 +10279,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10343,63 +10305,62 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB20_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: xor_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10411,199 +10372,201 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB20_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: xor_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB20_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: xor_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB20_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -10934,11 +10897,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -10982,11 +10942,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
-; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11885,20 +11842,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-LABEL: max_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_mov_b32 s0, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX8_DPP-NEXT: s_brev_b32 s1, 1
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
@@ -11953,22 +11906,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB23_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -11984,20 +11937,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-LABEL: max_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_mov_b32 s0, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX9_DPP-NEXT: s_brev_b32 s1, 1
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
@@ -12052,21 +12001,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB23_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -12081,15 +12030,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: max_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: s_mov_b32 s4, 0
; GFX1064_DPP-NEXT: s_brev_b32 s5, 1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12156,30 +12103,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB23_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10]
@@ -12191,15 +12139,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: max_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: s_mov_b32 s0, 0
; GFX1032_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12244,29 +12190,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB23_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -12278,79 +12225,79 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: max_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: s_mov_b32 s0, 0
-; GFX1164_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_mov_b32 s4, 0
+; GFX1164_DPP-NEXT: s_brev_b32 s5, 1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s5
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12369,29 +12316,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB23_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12406,55 +12354,54 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_DPP-LABEL: max_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: s_mov_b32 s0, 0
; GFX1132_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12462,27 +12409,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB23_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12821,11 +12769,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+12814,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 -; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13771,21 +13713,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: min_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -13839,22 +13777,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB26_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 @@ -13868,21 +13806,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: min_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 
v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -13936,21 +13870,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB26_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 @@ -13964,15 +13898,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14039,29 +13971,30 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; 
GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] @@ -14073,15 +14006,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14126,28 +14057,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB26_2: ; 
GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] @@ -14159,79 +14091,79 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: min_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 
row_shr:1 row_mask:0xf bank_mask:0xf @@ -14250,29 +14182,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB26_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14287,82 +14219,82 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: 
v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB26_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15634,85 +15566,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umax_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 
v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc @@ -15726,84 +15657,83 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: umax_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; 
GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc @@ -15817,18 +15747,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, 
s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15892,30 +15818,31 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] @@ -15927,13 +15854,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -15978,29 +15903,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; 
GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] @@ -16016,73 +15942,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 
row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: 
v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16101,29 +16027,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB29_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16142,49 +16069,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: 
v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16192,27 +16119,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB29_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17485,201 +17413,193 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umin_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: 
v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: umin_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; 
GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: 
v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; 
GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: umin_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17743,30 +17663,31 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB32_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: 
v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] @@ -17778,13 +17699,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -17829,29 +17748,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] @@ -17867,73 +17787,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp 
v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17952,29 +17872,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB32_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; 
GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17993,49 +17914,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: 
v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18043,27 +17964,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB32_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: 
v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
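Note: the `umax_i64_varying`/`umin_i64_varying` check blocks above are autogenerated FileCheck expectations for the atomic optimizer's DPP strategy: each lane holds a different 64-bit operand, the wave combines them pairwise with `v_mov_b32_dpp` row shifts plus `v_cmp`/`v_cndmask`, and a single lane then issues the `ds_max_rtn_u64`/`ds_min_rtn_u64` whose result is broadcast back with `v_readfirstlane_b32`. A minimal sketch of the kind of IR input that produces such checks follows; the LDS variable name and the `seq_cst` ordering are illustrative assumptions, not copied from the test file.

```llvm
; Hypothetical reduced input; the global's name and the ordering are assumptions.
@local_var64 = internal addrspace(3) global i64 poison, align 8

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
entry:
  ; The operand is lane-varying, so the pass must reduce across the wave
  ; before issuing one DS atomic, which is what the DPP sequence above does.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %val = zext i32 %id to i64
  %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 %val seq_cst
  store i64 %old, ptr addrspace(1) %out
  ret void
}
```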
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 6bd0b11..d62ff37 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -59,7 +59,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -120,7 +121,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -177,7 +179,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
@@ -186,8 +189,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; GFX11_GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
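The common thread in the two `global-atomic-fadd` MIR files above and below: `V_SET_INACTIVE_B32` loses its ad-hoc two-operand form with an implicit dead `$scc` def and takes what looks like a VOP3-style form, `V_SET_INACTIVE_B32 0, src, 0, identity, mask`, whose trailing operand is an explicit exec-mask scratch register (here freshly `IMPLICIT_DEF`ed, which also shifts the later `[[DEF*]]` names and the `%44`/`%45`-style WWM temporaries by one). At the IR level the operation corresponds to the `llvm.amdgcn.set.inactive` intrinsic; a hedged sketch of its usual shape follows (the `strict.wwm` wrapper reflects how such a value is normally consumed, not anything shown in these hunks):

```llvm
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define i32 @set_inactive_sketch(i32 %v) {
  ; Lanes that are disabled in the original exec mask observe the second
  ; operand; -2147483648 is 0x80000000, the -0.0 identity of the fadd scan.
  %masked = call i32 @llvm.amdgcn.set.inactive.i32(i32 %v, i32 -2147483648)
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %masked)
  ret i32 %wwm
}
```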
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 6766c0c..946ee9e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -61,7 +61,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -80,7 +81,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.2
; GFX90A-NEXT: {{ $}}
@@ -102,11 +103,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: bb.4 (%ir-block.35):
; GFX90A-NEXT: successors: %bb.3(0x80000000)
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -142,7 +143,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -161,7 +163,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: S_BRANCH %bb.2
; GFX940-NEXT: {{ $}}
@@ -183,11 +185,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: bb.4 (%ir-block.35):
; GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
+ ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -219,7 +221,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -229,8 +232,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+ ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
@@ -244,7 +247,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
@@ -266,11 +269,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: bb.4 (%ir-block.32):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
- ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+ ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]]
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
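In the `global_atomics_scan_fadd.ll` hunks that follow, the GFX9-DPP prologue no longer toggles `exec` twice around a `v_mov_b32` just to seed inactive lanes with the identity; with the saved mask still in `s[0:1]` it emits a single `v_cndmask_b32_e64 v4, v3, v0, s[0:1]`. Per lane that is nothing more than a select between the reduction identity and the live value, roughly as in this sketch (names illustrative):

```llvm
define float @seed_lane(i1 %was_active, float %v) {
  ; %was_active = this lane's bit in the saved exec mask (s[0:1] above);
  ; -0.0 (bit pattern 0x80000000) is the identity element for fadd.
  %seeded = select i1 %was_active, float %v, float -0.0
  ret float %seeded
}
```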
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 311c609..2b18f47 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,10 +816,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2027,10 +2024,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3298,10 +3292,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4065,10 +4056,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5361,10 +5349,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7330,44 +7315,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7463,9 +7449,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent 
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7585,9 +7571,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7691,12 +7677,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -7813,42 +7799,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 
row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8907,44 +8893,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] +; 
GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -9008,9 +8995,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9096,9 +9083,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9168,12 +9155,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9259,42 +9246,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 
row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10330,44 +10317,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 
v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -10431,9 +10419,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10519,9 +10507,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10591,12 +10579,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -10682,42 +10670,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp 
v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11235,44 +11223,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] +; GFX9-DPP-NEXT: 
v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -11336,9 +11325,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11424,9 +11413,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11496,12 +11485,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11587,42 +11576,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp 
v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -13305,44 +13294,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] +; GFX9-DPP-NEXT: 
v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -13438,9 +13428,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13560,9 +13550,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13666,12 +13656,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -13788,42 +13778,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf 
bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 9dc82b1..e3144ae 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10
row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; 
GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp 
v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: 
v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: 
v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 945583c..ddc1031 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-;
GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5130,12 +5122,12 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; 
GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 3bc0f25..f353edf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,10 +894,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2217,10 +2214,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3540,10 +3534,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4359,10 +4350,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5681,10 +5669,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7650,44 +7635,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7783,9 +7769,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7905,9 +7891,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8011,12 +7997,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8133,42 +8119,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -9226,44 +9212,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9327,9 +9314,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9415,9 +9402,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9487,12 +9474,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9578,42 +9565,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -10649,44 +10636,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10750,9 +10738,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10838,9 +10826,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10910,12 +10898,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11001,42 +10989,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -11554,44 +11542,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11655,9 +11644,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11743,9 +11732,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11815,12 +11804,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11906,42 +11895,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -13623,44 +13612,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13756,9 +13746,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13878,9 +13868,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13984,12 +13974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14106,42 +14096,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index c1b58f1..fbf8c203 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -93,18 +93,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL11: ; %bb.0:
; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v10
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
-; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL11-NEXT: global_store_b64 v[8:9], v[3:4], off
; DAGISEL11-NEXT: s_endpgm
;
; GISEL10-LABEL: set_inactive_chain_arg_64:
@@ -127,16 +127,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL10: ; %bb.0:
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, v10
;
DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 -; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 -; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 -; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[3:4], off ; DAGISEL10-NEXT: s_endpgm ; ; GISEL11_W64-LABEL: set_inactive_chain_arg_64: @@ -162,18 +162,19 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL11_W64: ; %bb.0: ; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v11 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v10 ; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1] ; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[3:4], off ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_64: @@ -196,16 +197,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL10_W64: ; %bb.0: ; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v11 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v10 ; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1] ; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[3:4], off ; DAGISEL10_W64-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0 %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll 
index 6dc4a2c..6fb5a9c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,16 +5,14 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -46,19 +44,19 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -96,11 +94,9 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3] ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 56 @@ -145,17 +141,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x40400000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 
@@ -170,21 +164,21 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: s_mov_b32 s1, 0x4010cccc -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x4010cccc +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -198,17 +192,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x10001 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -222,17 +214,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -247,21 +237,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 1 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: 
v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -276,21 +264,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 1.0 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -304,17 +290,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -329,21 +313,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x10001 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; 
GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -358,21 +341,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -387,21 +369,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -416,19 +397,19 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: 
s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -442,16 +423,14 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -465,16 +444,14 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -488,16 +465,14 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -511,16 +486,14 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 8a5f753..b0fb24e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -111,7 +111,7 @@ body: | ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 - ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) @@ -212,7 +212,7 @@ body: | $sgpr22 = IMPLICIT_DEF SI_SPILL_S32_SAVE $sgpr22, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) - %0:vgpr_32 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + %0:vgpr_32 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc bb.1: KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index ff692ac..92117e0 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 @@ -1722,8 +1722,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 
; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index 64a7c445..3013aab 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -40,9 +40,6 @@ define amdgpu_vs void @no_wqm_in_vs() { ret void } - define amdgpu_ps void @preloaded_set_inactive() { - ret void - } ... --- @@ -155,7 +152,7 @@ registers: - { id: 9, class: sreg_32, preferred-register: '' } - { id: 10, class: vgpr_32, preferred-register: '' } - { id: 11, class: vgpr_32, preferred-register: '' } - - { id: 12, class: sreg_32, preferred-register: '' } + - { id: 12, class: vgpr_32, preferred-register: '' } - { id: 13, class: vgpr_32, preferred-register: '' } - { id: 14, class: vgpr_32, preferred-register: '' } - { id: 15, class: vgpr_32, preferred-register: '' } @@ -179,7 +176,8 @@ body: | %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, implicit $exec %16:vgpr_32 = COPY %8.sub1 %11:vgpr_32 = COPY %16 - %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc + %17:sreg_64_xexec = IMPLICIT_DEF + %10:vgpr_32 = V_SET_INACTIVE_B32 0, %11, 0, undef %12, undef %17, implicit $exec, implicit-def $scc %14:vgpr_32 = COPY %7 %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec @@ -298,8 +296,9 @@ body: | %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:vgpr_32 = COPY $vgpr0 %2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec - %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc - %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc + %4:sreg_64_xexec = IMPLICIT_DEF + %2.sub0:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub0:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc + %2.sub1:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub1:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc %3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec $vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec $vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec @@ -446,19 +445,3 @@ body: | %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ... - ---- -# Preserve V_SET_INACTIVE with exec mask already specified -#CHECK-LABEL: name: preloaded_set_inactive -#CHECK: V_SET_INACTIVE_B32 -name: preloaded_set_inactive -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr1, $vgpr2 - - %0:vgpr_32 = COPY $vgpr1 - %1:vgpr_32 = COPY $vgpr2 - %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 47e1897..b35ef64 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -170,11 +170,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -201,6 +200,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -352,32 +354,32 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 -; GFX9-O0-NEXT: s_mov_b32 s42, s6 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s43, s7 -; GFX9-O0-NEXT: s_mov_b32 s44, s43 -; GFX9-O0-NEXT: s_mov_b32 s45, s42 +; 
GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 -; GFX9-O0-NEXT: s_mov_b32 s46, s35 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s46 -; GFX9-O0-NEXT: s_mov_b32 s38, s45 -; GFX9-O0-NEXT: s_mov_b32 s39, s44 +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[40:41] ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] ; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12 @@ -396,8 +398,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 @@ -417,11 +419,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 @@ -539,20 +539,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s50, s33 +; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, 
off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill @@ -560,48 +560,59 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s35, s41 -; GFX9-O0-NEXT: s_mov_b32 s42, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: s_mov_b32 s35, s39 +; GFX9-O0-NEXT: s_mov_b32 s44, s38 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s43, s37 -; GFX9-O0-NEXT: s_mov_b32 s44, s36 -; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 -; GFX9-O0-NEXT: s_mov_b32 s45, s43 -; GFX9-O0-NEXT: s_mov_b32 s46, s42 -; GFX9-O0-NEXT: s_mov_b32 s47, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_mov_b32 s45, s37 +; GFX9-O0-NEXT: s_mov_b32 s40, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s41, s45 +; GFX9-O0-NEXT: s_mov_b32 s42, s44 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_writelane_b32 v1, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v1, s43, 3 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def 
$sgpr36_sgpr37 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s38, s35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 +; GFX9-O0-NEXT: s_mov_b32 s40, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s38 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s40 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[38:39] ; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s35, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -610,24 +621,24 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; 
implicit-def: $sgpr40 @@ -645,21 +656,21 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s50 +; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -671,7 +682,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill @@ -682,25 +692,25 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[34:35] ; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; 
GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 @@ -735,8 +745,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 @@ -747,73 +759,82 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_mov_b32 s34, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s42, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: s_mov_b32 s44, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45 +; GFX9-O0-NEXT: s_mov_b32 s45, s35 +; GFX9-O0-NEXT: s_mov_b32 s42, s45 +; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] +; GFX9-O0-NEXT: s_mov_b32 s35, s44 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41] +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41] +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[40:41] +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[36:39], s34 offen 
offset:16 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -823,50 +844,38 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: s_mov_b32 s36, -1 -; GFX9-O3-NEXT: s_brev_b32 s37, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; 
GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -874,6 +883,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -908,9 +918,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -961,110 +973,113 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; 
GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40 +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39 +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38 +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37 +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36 +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(5) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42 -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41 -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41 ; GFX9-O0-NEXT: s_waitcnt vmcnt(5) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39 +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40 ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38 +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v39 ; GFX9-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v38 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36 +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v37 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -1104,63 +1119,100 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-O0-NEXT: s_mov_b32 s37, s39 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_mov_b32 s36, s38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, 
v4 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 @@ -1219,8 +1271,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -1242,32 +1296,32 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16 ; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; 
GFX9-O3-NEXT: v_mov_b32_e32 v3, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 7f0db3e..7fecab0 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -147,11 +147,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -178,6 +177,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -336,40 +338,40 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 ; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s17 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: s_mov_b32 s3, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_mov_b32 s7, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s8 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s9 -; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s7 +; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -398,13 +400,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GFX9-O0-NEXT: 
v_readlane_b32 s2, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -434,15 +436,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -587,41 +587,56 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s19 -; GFX9-O0-NEXT: s_mov_b32 s7, s18 -; GFX9-O0-NEXT: s_mov_b32 s15, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s8, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s15 +; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b32 s15, s7 +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: s_mov_b32 s2, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -631,11 +646,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: s_mov_b32 s0, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12 @@ -650,20 +660,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -701,14 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: 
v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -717,6 +726,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] @@ -724,14 +734,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 @@ -763,109 +773,103 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: s_mov_b32 s4, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s8, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GFX9-O0-NEXT: s_mov_b32 s11, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, s11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; 
GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, 
v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: _amdgpu_cs_main: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: s_mov_b32 s6, -1 -; GFX9-O3-NEXT: s_brev_b32 s7, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_endpgm %tmp17 = shl i32 %index, 5 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) @@ -915,15 +919,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf 
@@ -1036,11 +1040,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1067,6 +1070,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1225,40 +1231,40 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s17 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: s_mov_b32 s3, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_mov_b32 s7, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s8 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s9 -; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s7 +; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: 
s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1287,13 +1293,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1323,15 +1329,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -1476,41 +1480,56 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s19 -; GFX9-O0-NEXT: s_mov_b32 s7, s18 -; GFX9-O0-NEXT: s_mov_b32 s15, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s8, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s15 +; GFX9-O0-NEXT: s_mov_b32 s17, s8 ; GFX9-O0-NEXT: s_mov_b32 s18, s7 ; GFX9-O0-NEXT: s_mov_b32 s19, s6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 +; GFX9-O0-NEXT: s_mov_b32 s15, s7 +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] ; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: s_mov_b32 s2, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1520,11 +1539,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: s_mov_b32 s0, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -1539,20 +1553,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1590,14 +1604,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -1606,6 +1619,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] @@ -1613,14 +1627,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 @@ -1652,109 +1666,103 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: s_mov_b32 s4, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, 
s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s8, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GFX9-O0-NEXT: s_mov_b32 s11, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, s11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; 
GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: s_mov_b32 s6, -1 -; GFX9-O3-NEXT: s_brev_b32 s7, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; 
GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_endpgm %tmp17 = shl i32 %index, 5 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index f6db9c4..176dfee 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -1,18 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT -; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX +; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT +; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0]; -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0]; +; PTX-NEXT: add.s32 %r3, %r2, %r1; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 -; +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr +; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4 +; OPT-NEXT: ret void %tmp = load i32, ptr %input1, align 4 %add = add i32 %tmp, %input2 store i32 %add, ptr %out @@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ ; PTX-LABEL: grid_const_struct( ; PTX: { -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4]; -; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0]; +; PTX-NEXT: ld.param.u32 
%r2, [grid_const_struct_param_0+4]; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_struct( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 -; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 -; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 -; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4 +; OPT-NEXT: ret void %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %int1 = load i32, ptr %gep1 @@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { -; PTX-NOT: .local -; PTX: cvta.param.{{.*}} +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; +; PTX-NEXT: { // callseq 0, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd3; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; PTX-NEXT: } // callseq 0 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) +; OPT-NEXT: ret void %call = call i32 @escape(ptr %input) ret void } define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { ; PTX-LABEL: multiple_grid_const_escape( -; PTX: mov.{{.*}} [[RD1:%.*]], 
multiple_grid_const_escape_param_0; -; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2; -; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]]; -; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]]; -; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]]; -; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]]; -; PTX: { -; PTX: st.param.b64 [param0+0], [[RD5]]; -; PTX: st.param.b64 [param2+0], [[RD6]]; -; +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot3[4]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.u64 %SPL, __local_depot3; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0; +; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2; +; PTX-NEXT: mov.u64 %rd3, %rd2; +; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1]; +; PTX-NEXT: cvta.param.u64 %rd4, %rd3; +; PTX-NEXT: mov.u64 %rd5, %rd1; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: add.u64 %rd7, %SP, 0; +; PTX-NEXT: add.u64 %rd8, %SPL, 0; +; PTX-NEXT: st.local.u32 [%rd8], %r1; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd6; +; PTX-NEXT: .param .b64 param1; +; PTX-NEXT: st.param.b64 [param1+0], %rd7; +; PTX-NEXT: .param .b64 param2; +; PTX-NEXT: st.param.b64 [param2+0], %rd4; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape3, +; PTX-NEXT: ( +; PTX-NEXT: param0, +; PTX-NEXT: param1, +; PTX-NEXT: param2 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; ; OPT-LABEL: define void @multiple_grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) { -; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NOT: alloca %struct.s -; OPT: [[A_ADDR:%.*]] = alloca i32, align 4 -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]]) -; +; OPT-NEXT: ret void %a.addr = alloca i32, align 4 store i32 %a, ptr %a.addr, align 4 %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b) @@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { ; PTX-LABEL: grid_const_memory_escape( -; PTX-NOT: .local -; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0; -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]]; -; PTX: st.global.u64 
[[[RD4:%.*]]], [[RD3]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_memory_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) +; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8 +; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 ret void } define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { ; PTX-LABEL: grid_const_inlineasm_escape( -; PTX-NOT .local -; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4; -; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]] -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]] -; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd4; +; PTX-NEXT: cvta.param.u64 %rd2, %rd7; +; PTX-NEXT: add.s64 %rd3, %rd2, 4; +; PTX-NEXT: // begin inline asm +; PTX-NEXT: add.s64 %rd1, %rd2, %rd3; +; PTX-NEXT: // end inline asm +; PTX-NEXT: st.global.u64 [%rd6], %rd1; +; PTX-NEXT: ret; +; PTX-NOT .local ; OPT-LABEL: define void @grid_const_inlineasm_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0 -; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1 -; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]]) -; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]]) -; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1) +; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr 
inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 +; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8 +; OPT-NEXT: ret void %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1 @@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0]; -; PTX: add.{{.*}} -; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}} -; PTX: st.param.{{.*}} [param0+0], [[RD3]] -; PTX: call -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<5>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: add.s32 %r2, %r1, %r1; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: { // callseq 2, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r3, [retval0+0]; +; PTX-NEXT: } // callseq 2 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_partial_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4 -; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]] -; OPT: store i32 [[TWICE]] -; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) -; OPT: ret void -; +; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 +; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] +; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4 +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) +; OPT-NEXT: ret void %val = load i32, ptr %input %twice = add i32 %val, %val store i32 %twice, ptr %output @@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4]; -; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}}; -; PTX: st.global.{{.*}} [{{.*}}], 
[[RD5]]; -; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]] -; PTX: st.param.{{.*}} [param0+0], [[RD5]] -; PTX: escape +; PTX-NEXT: .reg .b32 %r<6>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: ld.u32 %r2, [%rd5+4]; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: { // callseq 3, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r4, [retval0+0]; +; PTX-NEXT: } // callseq 3 +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_partial_escapemem( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0 -; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4 -; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1 -; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4 -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: store ptr [[INPUT1]] -; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] -; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]]) -; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] +; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) +; OPT-NEXT: ret i32 [[ADD]] %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %val1 = load i32, ptr %ptr1 %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu ret i32 %add } -define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape( -; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}} -; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]]; -; PTX: $L__BB[[TARGET_LABEL]]: -; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]]; -; -; OPT-LABEL: define void @grid_const_phi_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], 
ptr {{%.*}}) { -; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +; PTX-LABEL: grid_const_phi( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0; +; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX-NEXT: mov.u64 %rd7, %rd5; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB8_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: add.s64 %rd8, %rd8, 4; +; PTX-NEXT: $L__BB8_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd8]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; OPT-NOT: load i32, ptr addrspace(101) -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 @@ -235,32 +375,53 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape2( -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4]; -; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]]; -; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]]; -; PTX: ld.u32 [[R1]], [[[RD1]]]; -; PTX: $L__BB[[LABEL]]: -; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]] -; OPT-LABEL: define void @grid_const_phi_escape2( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) { -; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 
-; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 -; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +; PTX-LABEL: grid_const_phi_ngc( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<12>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7; +; PTX-NEXT: mov.u64 %rd10, %rd6; +; PTX-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB9_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; +; PTX-NEXT: mov.u64 %rd9, %rd8; +; PTX-NEXT: cvta.param.u64 %rd2, %rd9; +; PTX-NEXT: add.s64 %rd11, %rd2, 4; +; PTX-NEXT: $L__BB9_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd11]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi_ngc( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 br i1 %less, label %first, label %second @@ -278,22 +439,42 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr 
%inout) { -; PTX-LABEL: grid_const_select_escape( -; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]] -; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0 -; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0; -; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]]; -; PTX: ld.u32 {{%.*}}, [[[RD4]]]; -; OPT-LABEL: define void @grid_const_select_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) { -; OPT: [[INPUT24:%.*]] = alloca i32, align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: load i32, ptr [[INOUT]] -; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]] -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { +; PTX-LABEL: grid_const_select( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1; +; PTX-NEXT: mov.u64 %rd5, %rd4; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd1; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd3]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; +; PTX-NEXT: ld.u32 %r2, [%rd9]; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_select( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 %ptrnew = select i1 %less, ptr %input1, ptr %input2 @@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval( define i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-LABEL: grid_const_ptrtoint( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0]; -; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}} -; PTX: cvt.u32.u64 {{%.*}}, [[RD1]] +; PTX: { +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0]; +; PTX-NEXT: cvta.param.u64 %rd3, 
%rd2; +; PTX-NEXT: cvt.u32.u64 %r2, %rd3; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_ptrtoint( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) { -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]] -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]] +; OPT-NEXT: ret i32 [[KEEPALIVE]] %val = load i32, ptr %input %ptrval = ptrtoint ptr %input to i32 %keepalive = add i32 %val, %ptrval @@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15} !15 = !{i32 1} -!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17} +!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17} !17 = !{i32 1} -!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19} +!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19} !19 = !{i32 1} -!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21} +!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21} !21 = !{i32 1} !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index f041f20..a414a6c 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,166 +1,469 @@ -; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32 -; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} - -%struct.ham = type { [4 x i32] } - -; // Verify that load with static offset into parameter is done directly. 
-; CHECK-LABEL: .visible .entry static_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = icmp eq i32 %arg2, 3 - br i1 %tmp, label %bb3, label %bb6 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 +source_filename = "<stdin>" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.S = type { i32, i32 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0 -bb3: ; preds = %bb - %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3 - %tmp5 = load i32, ptr %tmp4, align 4 - store i32 %tmp5, ptr %arg, align 4 - br label %bb6 +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1 -bb6: ; preds = %bb3, %bb +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; 
COMMON-NEXT: ret void +; +entry: + %i = load i32, ptr %s, align 4 + store i32 %i, ptr %out, align 4 ret void } -; // Verify that load with dynamic offset into parameter is also done directly. -; CHECK-LABEL: .visible .entry dynamic_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]], +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]], +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %i = load i32, ptr %b, align 4 + store i32 %i, ptr %out, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep_asc( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %i = load i32, ptr addrspace(101) %asc, align 4 + store i32 %i, ptr %out, align 4 + ret void +} -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly 
byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp - %tmp4 = load i32, ptr %tmp3, align 4 - store i32 %tmp4, ptr %arg, align 4 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep_asc0( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void +; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %asc0 = addrspacecast ptr addrspace(101) %asc to ptr + %i = load i32, ptr %asc0, align 4 + store i32 %i, ptr %out, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; COMMON-NEXT: ret void +; +entry: + call void @_Z6escapePv(ptr noundef nonnull %s) #0 ret void } -; Same as above, but with a bitcast present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_gep( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef 
byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COMMON-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1 +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + call void @_Z6escapePv(ptr noundef nonnull %b) #0 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +entry: + store ptr %s, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_gep_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void +; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + store ptr %b, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse 
nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptrtoint( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void +; +entry: + %i = ptrtoint ptr %s to i64 + store i64 %i, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_from_param( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_to_param( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local 
void @copy_on_store( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4 +; COMMON-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { bb: - %n64 = sext i32 %n to i64 - %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %load = load i8, ptr %gep, align 4 - store i8 %load, ptr %out, align 4 + %i = load i32, ptr %in, align 4 + store i32 %i, ptr %s, align 4 ret void } -; Same as above, but with an ASC(101) present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast_asc -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1 +define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; SM_60-LABEL: define void @test_select( +; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 +; SM_60-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1 +; SM_70-LABEL: define void @test_select( +; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] 
= addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +bb: + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %out, align 4 + ret void +} + +define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; COMMON-LABEL: define void @test_select_write( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { bb: - %n64 = sext i32 %n to i64 - %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %asc = addrspacecast ptr %gep to ptr addrspace(101) - %load = load i8, ptr addrspace(101) %asc, align 4 - store i8 %load, ptr %out, align 4 - ret void -} - - -; Verify that if the pointer escapes, then we do fall back onto using a temp copy. 
-; CHECK-LABEL: .visible .entry pointer_escapes -; CHECK: .local .align 4 .b8 __local_depot{{.*}} -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1]; -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]], -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]], -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]], -; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; - -; Function Attrs: convergent norecurse nounwind mustprogress -define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 { + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + store i32 1, ptr %ptrnew, align 4 + ret void +} + +define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) { +; SM_60-LABEL: define void @test_phi( +; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_60: [[FIRST]]: +; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; SM_60-NEXT: br label %[[MERGE:.*]] +; SM_60: [[SECOND]]: +; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; SM_60-NEXT: br label %[[MERGE]] +; SM_60: [[MERGE]]: +; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define void @test_phi( +; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr 
addrspace(1) +; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_70: [[FIRST]]: +; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; SM_70-NEXT: br label %[[MERGE:.*]] +; SM_70: [[SECOND]]: +; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; SM_70-NEXT: br label %[[MERGE]] +; SM_70: [[MERGE]]: +; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; SM_70-NEXT: ret void +; bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp - %tmp4 = load i32, ptr %tmp3, align 4 - store i32 %tmp4, ptr %arg, align 4 - %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3 + br i1 %cond, label %first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %inout, align 4 ret void } -; Function Attrs: convergent nounwind -declare dso_local ptr @escape(ptr) local_unnamed_addr +define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { +; COMMON-LABEL: define void @test_phi_write( +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; COMMON: [[FIRST]]: +; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; COMMON-NEXT: br label %[[MERGE:.*]] +; COMMON: [[SECOND]]: +; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; COMMON-NEXT: br label %[[MERGE]] +; COMMON: [[MERGE]]: +; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void +; +bb: + br i1 %cond, label 
%first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + store i32 1, ptr %ptrnew, align 4 + ret void +} +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } -!llvm.module.flags = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4, !5, !6, !7} +!llvm.module.flags = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19} +!llvm.ident = !{!20, !21} -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]} +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} !1 = !{i32 1, !"wchar_size", i32 4} !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} -!3 = !{ptr @static_offset, !"kernel", i32 1} -!4 = !{ptr @dynamic_offset, !"kernel", i32 1} -!5 = !{ptr @pointer_escapes, !"kernel", i32 1} -!6 = !{ptr @gep_bitcast, !"kernel", i32 1} -!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1} +!3 = !{i32 7, !"frame-pointer", i32 2} +!4 = !{ptr @read_only, !"kernel", i32 1} +!5 = !{ptr @escape_ptr, !"kernel", i32 1} +!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1} +!7 = !{ptr @escape_ptr_store, !"kernel", i32 1} +!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1} +!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1} +!10 = !{ptr @memcpy_from_param, !"kernel", i32 1} +!11 = !{ptr @memcpy_to_param, !"kernel", i32 1} +!12 = !{ptr @copy_on_store, !"kernel", i32 1} +!13 = !{ptr @read_only_gep, !"kernel", i32 1} +!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1} +!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1} +!16 = !{ptr @test_select, !"kernel", i32 1} +!17 = !{ptr @test_phi, !"kernel", i32 1} +!18 = !{ptr @test_phi_write, !"kernel", i32 1} +!19 = !{ptr @test_select_write, !"kernel", i32 1} +!20 = !{!"clang version 20.0.0git"} +!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir index 8a83543..fd3630b 100644 --- a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir +++ b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir @@ -18,6 +18,9 @@ define i32 @foo2(i32 %a, i32 %b) comdat { ret i32 0 } define i32 @foo3(i32 %a, i32 %b) section ".abc" { ret i32 0 } + + define i32 @foo4(i32 %a, i32 %b) !section_prefix !0 { ret i32 0 } + !0 = !{!"function_section_prefix", !"myprefix"} ... 
--- name: foo @@ -27,23 +30,24 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 @@ -109,26 +113,27 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; 
CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo2 ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 @@ -223,6 +228,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo3 ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 @@ -289,3 +295,89 @@ body: | bb.3: PseudoRET ... +--- +name: foo4 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo4 + ; CHECK: bb.0: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: PseudoRET + ; + ; CHECK-FS-LABEL: name: foo4 + ; CHECK-FS: bb.0: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; CHECK-FS-NEXT: PseudoBR %bb.3 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.1: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; CHECK-FS-NEXT: PseudoBR %bb.3 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.2: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + ; 
CHECK-FS-NEXT: PseudoBR %bb.3 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.3: + ; CHECK-FS-NEXT: PseudoRET + bb.0: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.1: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.2: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.3: + PseudoRET +... diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index c0b14d2..5f0ba4a 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s -; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s declare float @llvm.wasm.loadf32.f16(ptr) declare void @llvm.wasm.storef16.f32(float, ptr) @@ -28,6 +27,13 @@ define <8 x half> @splat_v8f16(float %x) { ret <8 x half> %v } +; CHECK-LABEL: const_splat_v8f16: +; CHECK: v128.const $push0=, 20800, 0, 0, 0, 0, 0, 0, 20800 +; CHECK-NEXT: return $pop0 +define <8 x half> @const_splat_v8f16() { + ret <8 x half> <half 42., half 0., half 0., half 0., half 0., half 0., half 0., half 42.> +} + ; CHECK-LABEL: extract_lane_v8f16: ; CHECK: f16x8.extract_lane $push0=, $0, 1 ; CHECK-NEXT: return $pop0 @@ -308,3 +314,24 @@ define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) { %a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x) ret <8 x i16> %a } + +; ============================================================================== +; Load and Store +; ============================================================================== +define <8 x half> @load_v8f16(ptr %p) { +; CHECK-LABEL: load_v8f16: +; CHECK: .functype load_v8f16 (i32) -> (v128) +; CHECK-NEXT: v128.load $push0=, 0($0) +; CHECK-NEXT: return $pop0 + %v = load <8 x half>, ptr %p + ret <8 x half> %v +} + +define void @store_v8f16(<8 x half> %v, ptr %p) { +; CHECK-LABEL: store_v8f16: +; CHECK: .functype store_v8f16 (v128, i32) -> () +; CHECK-NEXT: v128.store 0($1), $0 +; CHECK-NEXT: return + store <8 x half> %v , ptr %p + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll index 430b0db..d1d1b0a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -7,8 +7,8 @@ target triple = "aarch64--linux-gnu" %pair = type { i8, i8 } ; CHECK-LABEL: test -; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8 -; CHECK: Found an estimated cost of 0 for VF 2 For 
instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8 ; CHECK-LABEL: entry: ; CHECK-LABEL: vector.body: ; CHECK: [[LOAD1:%.*]] = load i8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 21af9ae..dec124b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -168,8 +168,8 @@ entry: ; gaps. ; ; VF_2-LABEL: Checking a loop in 'i64_factor_8' -; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index c7a04e3..976c6a9 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -15,10 +15,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i8_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 @@ -56,10 +56,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i16_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for 
VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 @@ -97,10 +97,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 @@ -138,25 +138,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_2' -; VF_2: Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 44 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' -; VF_4: Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 88 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' -; VF_8: Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load 
i64, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 176 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' -; VF_16: Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 352 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 @@ -179,10 +179,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f16_factor_2' -; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_2' ; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2 @@ -261,25 +261,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f64_factor_2' -; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_2: Found an estimated cost 
of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_2' -; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_2' -; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_2' -; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0 @@ -306,33 +306,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i8_factor_3' -; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load 
i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_3' -; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_3' -; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr 
%tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_3' -; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -358,33 +358,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i16_factor_3' -; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_3' -; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 72 for VF 4 
For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -410,33 +410,33 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i32_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -462,33 +462,33 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i64_factor_3'
-; VF_2: Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 66 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_3'
-; VF_4: Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 132 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_3'
-; VF_8: Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 264 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_3'
-; VF_16: Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 528 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -514,12 +514,12 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'f16_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
; VF_4-LABEL: Checking a loop in 'f16_factor_3'
; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -573,12 +573,12 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp1, align 4
; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_3'
; VF_8: Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, ptr %tmp1, align 4
@@ -618,33 +618,33 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'f64_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_3'
-; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 36 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_3'
-; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 72 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_3'
-; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 144 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -673,41 +673,41 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i8_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_8-LABEL: Checking a loop in 'i8_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_16-LABEL: Checking a loop in 'i8_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -736,41 +736,41 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i16_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -799,41 +799,41 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i32_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -862,41 +862,41 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'i64_factor_4'
-; VF_2: Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 88 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_4'
-; VF_4: Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 176 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_4'
-; VF_8: Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 352 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_4'
-; VF_16: Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 704 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -997,14 +997,14 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp6, ptr %tmp2, align 4
; VF_2-NEXT: Found an estimated cost of 20 for VF 2 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_4'
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp1, align 4
@@ -1051,41 +1051,41 @@ entry:
  br label %for.body

; VF_2-LABEL: Checking a loop in 'f64_factor_4'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_4'
-; VF_4: Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_4'
-; VF_8: Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 96 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_4'
-; VF_16: Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 192 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
for.body:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 6d309c4..df02cb7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -176,7 +176,7 @@ declare i16 @llvm.umax.i16(i16, i16)
; Test case for https://github.com/llvm/llvm-project/issues/106780.
define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 {
; CHECK-LABEL: define i32 @cost_of_exit_branch_and_cond_insts(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[X]] to i32
; CHECK-NEXT: [[UMAX3:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP0]], i32 111)
@@ -404,6 +404,95 @@ exit:
  ret void
}

+; Test for https://github.com/llvm/llvm-project/issues/108098.
+define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %src, ptr noalias %dst) #0 { +; CHECK-LABEL: define void @gather_interleave_group_with_dead_insert_pos( +; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 16, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> +; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[STEP_ADD]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP11]], <8 x ptr> [[TMP13]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP12]], <8 x ptr> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = 
add <8 x i64> [[STEP_ADD]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1 +; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]] +; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[L_1]] to i32 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 4 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %gep.src.0 = getelementptr i8, ptr %src, i64 %iv + %l.dead = load i8, ptr %gep.src.0, align 1 + %iv.1 = add i64 %iv, 1 + %gep.src.1 = getelementptr i8, ptr %src, i64 %iv.1 + %l.1 = load i8, ptr %gep.src.1, align 1 + %ext = zext i8 %l.1 to i32 + %gep.dst = getelementptr i32, ptr %dst, i64 %iv + store i32 %ext, ptr %gep.dst, align 4 + %iv.next = add nsw i64 %iv, 4 + %ec = icmp slt i64 %iv, %N + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v" } + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -425,4 +514,6 @@ exit: ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll index 88eb9c4..13c443c 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll @@ -6,7 +6,8 @@ ; Check cost function for <8 x i128> store interleave group. 
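; The CHECK'd cost lines below are taken from the vectorizer's debug stream.
; As a hedged reproduction sketch (assumes an assertions-enabled build; these
; flags are illustrative, not this test's actual RUN line):
;   opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s
; Each "LV: Found an estimated cost of N for VF 4" line is the cost the model
; assigned to one member of the <8 x i128> store interleave group.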
; CHECK: LV: Checking a loop in 'fun' -; CHECK: LV: Found an estimated cost of 8 for VF 4 For instruction: store i128 8721036757475490113 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113 define noundef i32 @fun(i32 %argc, ptr nocapture readnone %argv) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll new file mode 100644 index 0000000..3d2c2e5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +; Check that we correctly handle the use of %start2 in the exit block, and do +; not crash. + +define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) { +; CHECK-LABEL: define i64 @foo( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i64 [[START:%.*]], i64 [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = and i64 [[START]], 12345 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START2]], [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]] +; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START2]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IND_NEXT1:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND1]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND1]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[IND_NEXT1]] = add i64 [[IND1]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IND_NEXT1]], [[END]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; 
CHECK-NEXT: [[USE:%.*]] = phi i64 [ [[START2]], %[[FOR_BODY]] ], [ [[START2]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[USE]] +; +entry: + %start2 = and i64 %start, 12345 + br label %for.body + +for.body: + %ind = phi i64 [ %start2, %entry ], [ %ind.next, %for.body ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind + %1 = load i32, ptr %arrayidx2, align 4 + %ind.next = add i64 %ind, 1 + %cmp = icmp ne i64 %ind.next, %end + br i1 %cmp, label %for.body, label %exit + +exit: + %use = phi i64 [ %start2, %for.body ] + ret i64 %use +} + +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index 7959e4d..56ed92e 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -4,6 +4,7 @@ ; There is no scenario currently of doing ctx profile use without thinlto. ; ; RUN: opt -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s +; RUN: opt -debug-info-for-profiling -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s declare void @bar() diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index d28af85..a3a62f0 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -11,7 +11,23 @@ // llvm-dis [options] x.bc - Read LLVM bitcode from the x.bc file, write asm // to the x.ll file. // Options: -// --help - Output information about command line switches +// +// Color Options: +// --color - Use colors in output (default=autodetect) +// +// Disassembler Options: +// -f - Enable binary output on terminals +// --materialize-metadata - Load module without materializing metadata, +// then materialize only the metadata +// -o <filename> - Override output filename +// --show-annotations - Add informational comments to the .ll file +// +// Generic Options: +// --help - Display available options +// (--help-hidden for more) +// --help-list - Display list of available options +// (--help-list-hidden for more) +// --version - Display the version of this program // //===----------------------------------------------------------------------===// diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index b76d24d..148afd9 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1354,14 +1354,18 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) { auto *BB = &*F->begin(); auto It = BB->begin(); auto *Select = cast<sandboxir::SelectInst>(&*It++); + const auto *ConstSelect = Select; // To test the const getters. auto *Ret = &*It++; // Check getCondition(). EXPECT_EQ(Select->getCondition(), Cond0); + EXPECT_EQ(ConstSelect->getCondition(), Cond0); // Check getTrueValue(). EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(ConstSelect->getTrueValue(), V0); // Check getFalseValue(). EXPECT_EQ(Select->getFalseValue(), V1); + EXPECT_EQ(ConstSelect->getFalseValue(), V1); // Check setCondition(). 
Select->setCondition(Cond1); EXPECT_EQ(Select->getCondition(), Cond1); @@ -1371,6 +1375,13 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) { // Check setFalseValue(). Select->setFalseValue(V0); EXPECT_EQ(Select->getFalseValue(), V0); + // Check swapValues(). + Select->swapValues(); + EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(Select->getFalseValue(), V1); + // Check areInvalidOperands. + EXPECT_EQ(sandboxir::SelectInst::areInvalidOperands(Cond0, V0, V1), nullptr); + EXPECT_NE(sandboxir::SelectInst::areInvalidOperands(V0, V1, Cond0), nullptr); { // Check SelectInst::create() InsertBefore. diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index a1f39fe..a1a4117 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -964,6 +964,32 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(Switch->findCaseDest(BB1), One); } +TEST_F(TrackerTest, SelectInst) { + parseIR(C, R"IR( +define void @foo(i1 %c0, i8 %v0, i8 %v1) { + %sel = select i1 %c0, i8 %v0, i8 %v1 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *V0 = F->getArg(1); + auto *V1 = F->getArg(2); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Select = cast<sandboxir::SelectInst>(&*It++); + + // Check tracking for swapValues. + Ctx.save(); + Select->swapValues(); + EXPECT_EQ(Select->getTrueValue(), V1); + EXPECT_EQ(Select->getFalseValue(), V0); + Ctx.revert(); + EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(Select->getFalseValue(), V1); +} + TEST_F(TrackerTest, ShuffleVectorInst) { parseIR(C, R"IR( define void @foo(<2 x i8> %v1, <2 x i8> %v2) { diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index c5849b6..ef51864 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -504,7 +504,7 @@ struct MatchableInfo { /// TheDef - This is the definition of the instruction or InstAlias that this /// matchable came from. - Record *const TheDef; + const Record *const TheDef; // ResInstSize - The size of the resulting instruction for this matchable. unsigned ResInstSize; @@ -762,7 +762,7 @@ public: RegisterClassesTy RegisterClasses; /// Map of Predicate records to their subtarget information. - std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures; + SubtargetFeatureInfoMap SubtargetFeatures; /// Map of AsmOperandClass records to their class information. std::map<const Record *, ClassInfo *> AsmOperandClasses; @@ -1338,7 +1338,7 @@ void AsmMatcherInfo::buildRegisterClasses( // Name the register classes which correspond to a user defined RegisterClass. for (const CodeGenRegisterClass &RC : RegClassList) { // Def will be NULL for non-user defined register classes. - Record *Def = RC.getDef(); + const Record *Def = RC.getDef(); if (!Def) continue; ClassInfo *CI = RegisterSetClasses[RegisterSet(RC.getOrder().begin(), @@ -1513,8 +1513,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() { void AsmMatcherInfo::buildInfo() { // Build information about all of the AssemblerPredicates. 
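  // Note on the change below: SubtargetFeaturesInfoVec (defined alongside
  // SubtargetFeatureInfoMap in SubtargetFeatureInfo.h later in this patch)
  // expands to std::vector<std::pair<const Record *, SubtargetFeatureInfo>>,
  // so the returned pairs can be inserted straight into the const-keyed
  // SubtargetFeatures map. Sketch of the resulting call pattern:
  //   SubtargetFeaturesInfoVec V = SubtargetFeatureInfo::getAll(Records);
  //   SubtargetFeatures.insert(V.begin(), V.end());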
- const std::vector<std::pair<Record *, SubtargetFeatureInfo>> - &SubtargetFeaturePairs = SubtargetFeatureInfo::getAll(Records); + SubtargetFeaturesInfoVec SubtargetFeaturePairs = + SubtargetFeatureInfo::getAll(Records); SubtargetFeatures.insert(SubtargetFeaturePairs.begin(), SubtargetFeaturePairs.end()); #ifndef NDEBUG @@ -3226,9 +3226,9 @@ static void emitMatchClassKindNames(std::forward_list<ClassInfo> &Infos, } static std::string -getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) { +getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) { std::string Name = "AMFBS"; - for (const auto &Feature : FeatureBitset) + for (const Record *Feature : FeatureBitset) Name += ("_" + Feature->getName()).str(); return Name; } @@ -3451,7 +3451,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { StringTable.EmitString(OS); OS << ";\n\n"; - std::vector<std::vector<Record *>> FeatureBitsets; + std::vector<std::vector<const Record *>> FeatureBitsets; for (const auto &MI : Info.Matchables) { if (MI->RequiredFeatures.empty()) continue; @@ -3460,8 +3460,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { FeatureBitsets.back().push_back(MI->RequiredFeatures[I]->TheDef); } - llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A, - const std::vector<Record *> &B) { + llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A, + const std::vector<const Record *> &B) { if (A.size() < B.size()) return true; if (A.size() > B.size()) diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index 88acd79..69ca9a8 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -403,7 +403,7 @@ void CodeEmitterGen::emitInstructionBaseValues( << HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n"; for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; + const Record *R = CGI->TheDef; if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) { @@ -485,7 +485,7 @@ void CodeEmitterGen::run(raw_ostream &o) { std::set<unsigned> HwModes; BitWidth = 0; for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; + const Record *R = CGI->TheDef; if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) continue; diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index fbf1d47..46aad7f 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -185,7 +185,7 @@ private: // KeyInstrVec - list of key instructions. std::vector<Record *> KeyInstrVec; - DenseMap<Record *, std::vector<Record *>> MapTable; + DenseMap<const Record *, std::vector<Record *>> MapTable; public: MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec) @@ -371,7 +371,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) { // emitted as first column. 
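  // Row layout of the emitted array, per the loop below: column 0 holds the
  // key instruction and columns 1..NumCol hold its mapped (related)
  // instructions, hence the NumCol + 1 second dimension:
  //   { KeyInstr, ColInstr_1, ..., ColInstr_NumCol }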
OS << "Table[][" << NumCol + 1 << "] = {\n"; for (unsigned i = 0; i < TotalNumInstr; i++) { - Record *CurInstr = NumberedInstructions[i]->TheDef; + const Record *CurInstr = NumberedInstructions[i]->TheDef; std::vector<Record *> ColInstrs = MapTable[CurInstr]; std::string OutStr; unsigned RelExists = 0; diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 4582478..a77e247 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -1255,28 +1255,28 @@ bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const { false); } Record *TreePredicateFn::getMemoryVT() const { - Record *R = getOrigPatFragRecord()->getRecord(); + const Record *R = getOrigPatFragRecord()->getRecord(); if (R->isValueUnset("MemoryVT")) return nullptr; return R->getValueAsDef("MemoryVT"); } ListInit *TreePredicateFn::getAddressSpaces() const { - Record *R = getOrigPatFragRecord()->getRecord(); + const Record *R = getOrigPatFragRecord()->getRecord(); if (R->isValueUnset("AddressSpaces")) return nullptr; return R->getValueAsListInit("AddressSpaces"); } int64_t TreePredicateFn::getMinAlignment() const { - Record *R = getOrigPatFragRecord()->getRecord(); + const Record *R = getOrigPatFragRecord()->getRecord(); if (R->isValueUnset("MinAlignment")) return 0; return R->getValueAsInt("MinAlignment"); } Record *TreePredicateFn::getScalarMemoryVT() const { - Record *R = getOrigPatFragRecord()->getRecord(); + const Record *R = getOrigPatFragRecord()->getRecord(); if (R->isValueUnset("ScalarMemoryVT")) return nullptr; return R->getValueAsDef("ScalarMemoryVT"); @@ -1390,7 +1390,7 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const { if (Tree->isLeaf()) TreeClassName = "SDNode"; else { - Record *Op = Tree->getOperator(); + const Record *Op = Tree->getOperator(); const SDNodeInfo &Info = PatFragRec->getDAGPatterns().getSDNodeInfo(Op); TreeClassName = Info.getSDClassName(); } @@ -1848,7 +1848,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { // TreePatternNode implementation // -static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) { +static unsigned GetNumNodeResults(const Record *Operator, + CodeGenDAGPatterns &CDP) { if (Operator->getName() == "set" || Operator->getName() == "implicit") return 0; // All return nothing. @@ -2077,7 +2078,7 @@ void TreePatternNode::InlinePatternFragments( return; } - Record *Op = getOperator(); + const Record *Op = getOperator(); if (!Op->isSubClassOf("PatFrags")) { if (getNumChildren() == 0) { @@ -2340,7 +2341,7 @@ TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const { /// return the ComplexPattern information, otherwise return null. 
const ComplexPattern * TreePatternNode::getComplexPatternInfo(const CodeGenDAGPatterns &CGP) const { - Record *Rec; + const Record *Rec; if (isLeaf()) { DefInit *DI = dyn_cast<DefInit>(getLeafValue()); if (!DI) @@ -2793,7 +2794,7 @@ bool TreePatternNode::canPatternMatch(std::string &Reason, // TreePattern implementation // -TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, +TreePattern::TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), Infer(*this) { @@ -2801,15 +2802,15 @@ TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, Trees.push_back(ParseTreePattern(I, "")); } -TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput, +TreePattern::TreePattern(const Record *TheRec, DagInit *Pat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), Infer(*this) { Trees.push_back(ParseTreePattern(Pat, "")); } -TreePattern::TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput, - CodeGenDAGPatterns &cdp) +TreePattern::TreePattern(const Record *TheRec, TreePatternNodePtr Pat, + bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), Infer(*this) { Trees.push_back(Pat); @@ -3389,7 +3390,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat, return false; } - Record *Rec; + const Record *Rec; if (Pat->isLeaf()) { DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue()); if (!DI) @@ -3408,7 +3409,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat, Slot = Pat; return true; } - Record *SlotRec; + const Record *SlotRec; if (Slot->isLeaf()) { SlotRec = cast<DefInit>(Slot->getLeafValue())->getDef(); } else { @@ -3633,7 +3634,8 @@ public: }; static bool InferFromPattern(CodeGenInstruction &InstInfo, - const InstAnalyzer &PatInfo, Record *PatDef) { + const InstAnalyzer &PatInfo, + const Record *PatDef) { bool Error = false; // Remember where InstInfo got its flags. @@ -3729,7 +3731,7 @@ static bool hasNullFragReference(ListInit *LI) { /// Get all the instructions in a tree. static void getInstructionsInTree(TreePatternNode &Tree, - SmallVectorImpl<Record *> &Instrs) { + SmallVectorImpl<const Record *> &Instrs) { if (Tree.isLeaf()) return; if (Tree.getOperator()->isSubClassOf("Instruction")) @@ -3935,8 +3937,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI, // Create and insert the instruction. // FIXME: InstImpResults should not be part of DAGInstruction. - Record *R = I.getRecord(); - DAGInsts.try_emplace(R, std::move(Results), std::move(Operands), + DAGInsts.try_emplace(I.getRecord(), std::move(Results), std::move(Operands), std::move(InstImpResults), SrcPattern, ResultPattern); LLVM_DEBUG(I.dump()); @@ -3989,9 +3990,7 @@ void CodeGenDAGPatterns::ParseInstructions() { } // If we can, convert the instructions to be patterns that are matched! - for (auto &Entry : Instructions) { - Record *Instr = Entry.first; - DAGInstruction &TheInst = Entry.second; + for (const auto &[Instr, TheInst] : Instructions) { TreePatternNodePtr SrcPattern = TheInst.getSrcPattern(); TreePatternNodePtr ResultPattern = TheInst.getResultPattern(); @@ -4078,7 +4077,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { for (const PatternToMatch &PTM : ptms()) { // We can only infer from single-instruction patterns, otherwise we won't // know which instruction should get the flags. 
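    // Sketch of the guard that follows: gather every Instruction record in
    // the destination pattern and skip the pattern unless exactly one was
    // found, because flags inferred from a pattern can only be attributed to
    // a single instruction.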
- SmallVector<Record *, 8> PatInstrs; + SmallVector<const Record *, 8> PatInstrs; getInstructionsInTree(PTM.getDstPattern(), PatInstrs); if (PatInstrs.size() != 1) continue; @@ -4135,7 +4134,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() { void CodeGenDAGPatterns::VerifyInstructionFlags() { unsigned Errors = 0; for (const PatternToMatch &PTM : ptms()) { - SmallVector<Record *, 8> Instrs; + SmallVector<const Record *, 8> Instrs; getInstructionsInTree(PTM.getDstPattern(), Instrs); if (Instrs.empty()) continue; @@ -4245,7 +4244,7 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) { } void CodeGenDAGPatterns::ParseOnePattern( - Record *TheDef, TreePattern &Pattern, TreePattern &Result, + const Record *TheDef, TreePattern &Pattern, TreePattern &Result, const std::vector<Record *> &InstImpResults, bool ShouldIgnore) { // Inline pattern fragments and expand multiple alternatives. @@ -4591,7 +4590,7 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N, std::vector<TreePatternNodePtr> &Children) { assert(N->getNumChildren() == 2 && "Associative but doesn't have 2 children!"); - Record *Operator = N->getOperator(); + const Record *Operator = N->getOperator(); // Only permit raw nodes. if (!N->getName().empty() || !N->getPredicateCalls().empty() || diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h index 88a5437..4dc08e6 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h @@ -634,7 +634,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> { /// OperatorOrVal - The Record for the operator if this is an interior node /// (not a leaf) or the init value (e.g. the "GPRC" record, or "7") for a /// leaf. - PointerUnion<Record *, Init *> OperatorOrVal; + PointerUnion<const Record *, Init *> OperatorOrVal; /// Name - The name given to this node with the :$foo notation. /// @@ -657,7 +657,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> { const Record *GISelFlags = nullptr; public: - TreePatternNode(Record *Op, std::vector<TreePatternNodePtr> Ch, + TreePatternNode(const Record *Op, std::vector<TreePatternNodePtr> Ch, unsigned NumResults) : OperatorOrVal(Op), TransformFn(nullptr), Children(std::move(Ch)) { Types.resize(NumResults); @@ -717,9 +717,9 @@ public: assert(isLeaf()); return cast<Init *>(OperatorOrVal); } - Record *getOperator() const { + const Record *getOperator() const { assert(!isLeaf()); - return cast<Record *>(OperatorOrVal); + return cast<const Record *>(OperatorOrVal); } unsigned getNumChildren() const { return Children.size(); } @@ -878,7 +878,7 @@ class TreePattern { /// TheRecord - The actual TableGen record corresponding to this pattern. /// - Record *TheRecord; + const Record *TheRecord; /// Args - This is a list of all of the arguments to this pattern (for /// PatFrag patterns), which are the 'node' markers in this pattern. @@ -908,11 +908,11 @@ class TreePattern { public: /// TreePattern constructor - Parse the specified DagInits into the /// current record. 
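  /// All three overloads below now take the originating record as
  /// const Record *. A hypothetical call site for the ListInit form
  /// (field name shown for illustration only):
  ///   TreePattern TP(FragRec, FragRec->getValueAsListInit("Fragments"),
  ///                  /*isInput=*/true, CDP);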
- TreePattern(Record *TheRec, ListInit *RawPat, bool isInput, + TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput, CodeGenDAGPatterns &ise); - TreePattern(Record *TheRec, DagInit *Pat, bool isInput, + TreePattern(const Record *TheRec, DagInit *Pat, bool isInput, CodeGenDAGPatterns &ise); - TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput, + TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput, CodeGenDAGPatterns &ise); /// getTrees - Return the tree patterns which corresponds to this pattern. @@ -935,7 +935,7 @@ public: /// getRecord - Return the actual TableGen record corresponding to this /// pattern. /// - Record *getRecord() const { return TheRecord; } + const Record *getRecord() const { return TheRecord; } unsigned getNumArgs() const { return Args.size(); } const std::string &getArgName(unsigned i) const { @@ -1054,7 +1054,7 @@ public: /// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns /// processed to produce isel. class PatternToMatch { - Record *SrcRecord; // Originating Record for the pattern. + const Record *SrcRecord; // Originating Record for the pattern. ListInit *Predicates; // Top level predicate conditions to match. TreePatternNodePtr SrcPattern; // Source pattern to match. TreePatternNodePtr DstPattern; // Resulting pattern. @@ -1065,16 +1065,16 @@ class PatternToMatch { unsigned ID; // Unique ID for the record. public: - PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src, - TreePatternNodePtr dst, std::vector<Record *> dstregs, - int complexity, unsigned uid, bool ignore, - const Twine &hwmodefeatures = "") + PatternToMatch(const Record *srcrecord, ListInit *preds, + TreePatternNodePtr src, TreePatternNodePtr dst, + std::vector<Record *> dstregs, int complexity, unsigned uid, + bool ignore, const Twine &hwmodefeatures = "") : SrcRecord(srcrecord), Predicates(preds), SrcPattern(src), DstPattern(dst), Dstregs(std::move(dstregs)), HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity), GISelShouldIgnore(ignore), ID(uid) {} - Record *getSrcRecord() const { return SrcRecord; } + const Record *getSrcRecord() const { return SrcRecord; } ListInit *getPredicates() const { return Predicates; } TreePatternNode &getSrcPattern() const { return *SrcPattern; } TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; } @@ -1099,14 +1099,14 @@ class CodeGenDAGPatterns { CodeGenTarget Target; CodeGenIntrinsicTable Intrinsics; - std::map<Record *, SDNodeInfo, LessRecordByID> SDNodes; - std::map<Record *, std::pair<Record *, std::string>, LessRecordByID> + std::map<const Record *, SDNodeInfo, LessRecordByID> SDNodes; + std::map<const Record *, std::pair<Record *, std::string>, LessRecordByID> SDNodeXForms; - std::map<Record *, ComplexPattern, LessRecordByID> ComplexPatterns; - std::map<Record *, std::unique_ptr<TreePattern>, LessRecordByID> + std::map<const Record *, ComplexPattern, LessRecordByID> ComplexPatterns; + std::map<const Record *, std::unique_ptr<TreePattern>, LessRecordByID> PatternFragments; std::map<const Record *, DAGDefaultOperand, LessRecordByID> DefaultOperands; - std::map<Record *, DAGInstruction, LessRecordByID> Instructions; + std::map<const Record *, DAGInstruction, LessRecordByID> Instructions; // Specific SDNode definitions: Record *intrinsic_void_sdnode; @@ -1134,7 +1134,7 @@ public: Record *getSDNodeNamed(StringRef Name) const; - const SDNodeInfo &getSDNodeInfo(Record *R) const { + const SDNodeInfo &getSDNodeInfo(const Record *R) const { auto F = 
SDNodes.find(R); assert(F != SDNodes.end() && "Unknown node!"); return F->second; @@ -1142,19 +1142,19 @@ public: // Node transformation lookups. typedef std::pair<Record *, std::string> NodeXForm; - const NodeXForm &getSDNodeTransform(Record *R) const { + const NodeXForm &getSDNodeTransform(const Record *R) const { auto F = SDNodeXForms.find(R); assert(F != SDNodeXForms.end() && "Invalid transform!"); return F->second; } - const ComplexPattern &getComplexPattern(Record *R) const { + const ComplexPattern &getComplexPattern(const Record *R) const { auto F = ComplexPatterns.find(R); assert(F != ComplexPatterns.end() && "Unknown addressing mode!"); return F->second; } - const CodeGenIntrinsic &getIntrinsic(Record *R) const { + const CodeGenIntrinsic &getIntrinsic(const Record *R) const { for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i) if (Intrinsics[i].TheDef == R) return Intrinsics[i]; @@ -1181,20 +1181,19 @@ public: } // Pattern Fragment information. - TreePattern *getPatternFragment(Record *R) const { + TreePattern *getPatternFragment(const Record *R) const { auto F = PatternFragments.find(R); assert(F != PatternFragments.end() && "Invalid pattern fragment request!"); return F->second.get(); } - TreePattern *getPatternFragmentIfRead(Record *R) const { + TreePattern *getPatternFragmentIfRead(const Record *R) const { auto F = PatternFragments.find(R); if (F == PatternFragments.end()) return nullptr; return F->second.get(); } - typedef std::map<Record *, std::unique_ptr<TreePattern>, - LessRecordByID>::const_iterator pf_iterator; + using pf_iterator = decltype(PatternFragments)::const_iterator; pf_iterator pf_begin() const { return PatternFragments.begin(); } pf_iterator pf_end() const { return PatternFragments.end(); } iterator_range<pf_iterator> ptfs() const { return PatternFragments; } @@ -1206,11 +1205,11 @@ public: iterator_range<ptm_iterator> ptms() const { return PatternsToMatch; } /// Parse the Pattern for an instruction, and insert the result in DAGInsts. 
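  /// DAGInstMap orders its const Record * keys with LessRecordByID, i.e. by
  /// each record's unique ID rather than by pointer value, so iteration is
  /// deterministic across runs. Illustrative use:
  ///   CodeGenDAGPatterns::DAGInstMap DAGInsts;
  ///   CDP.parseInstructionPattern(CGI, Pattern, DAGInsts);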
- typedef std::map<Record *, DAGInstruction, LessRecordByID> DAGInstMap; + typedef std::map<const Record *, DAGInstruction, LessRecordByID> DAGInstMap; void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern, DAGInstMap &DAGInsts); - const DAGInstruction &getInstruction(Record *R) const { + const DAGInstruction &getInstruction(const Record *R) const { auto F = Instructions.find(R); assert(F != Instructions.end() && "Unknown instruction!"); return F->second; @@ -1244,7 +1243,7 @@ private: void GenerateVariants(); void VerifyInstructionFlags(); - void ParseOnePattern(Record *TheDef, TreePattern &Pattern, + void ParseOnePattern(const Record *TheDef, TreePattern &Pattern, TreePattern &Result, const std::vector<Record *> &InstImpResults, bool ShouldIgnore = false); diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp index 1cc217b..8d698fa 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp @@ -298,7 +298,8 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) { return std::pair(0U, 0U); } -static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) { +static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, + const Record *Rec) { // EARLY_CLOBBER: @early $reg StringRef::size_type wpos = CStr.find_first_of(" \t"); StringRef::size_type start = CStr.find_first_not_of(" \t"); @@ -391,7 +392,8 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) { Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint; } -static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) { +static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, + const Record *Rec) { if (CStr.empty()) return; @@ -428,7 +430,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { // CodeGenInstruction Implementation //===----------------------------------------------------------------------===// -CodeGenInstruction::CodeGenInstruction(Record *R) +CodeGenInstruction::CodeGenInstruction(const Record *R) : TheDef(R), Operands(R), InferredFrom(nullptr) { Namespace = R->getValueAsString("Namespace"); AsmString = std::string(R->getValueAsString("AsmString")); @@ -501,7 +503,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) HasComplexDeprecationPredicate = true; DeprecatedReason = std::string(R->getValueAsString("ComplexDeprecationPredicate")); - } else if (RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) { + } else if (const RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) { // Check if we have a Subtarget feature mask. HasComplexDeprecationPredicate = false; DeprecatedReason = Dep->getValue()->getAsString(); diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h index f4af0e8..3d4360f 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.h +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h @@ -222,8 +222,8 @@ public: class CodeGenInstruction { public: - Record *TheDef; // The actual record defining this instruction. - StringRef Namespace; // The namespace the instruction is in. + const Record *TheDef; // The actual record defining this instruction. + StringRef Namespace; // The namespace the instruction is in. /// AsmString - The format string used to emit a .s file for the /// instruction. 
@@ -297,12 +297,12 @@ public: // The record used to infer instruction flags, or NULL if no flag values // have been inferred. - Record *InferredFrom; + const Record *InferredFrom; // The enum value assigned by CodeGenTarget::computeInstrsByEnum. mutable unsigned EnumVal = 0; - CodeGenInstruction(Record *R); + CodeGenInstruction(const Record *R); /// HasOneImplicitDefWithKnownVT - If the instruction has at least one /// implicit def and it has a known VT, return the VT, otherwise return diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index cadc242..5b43f7d 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -34,7 +34,6 @@ #include <iterator> #include <map> #include <queue> -#include <set> #include <string> #include <tuple> #include <utility> @@ -48,7 +47,7 @@ using namespace llvm; // CodeGenSubRegIndex //===----------------------------------------------------------------------===// -CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum, +CodeGenSubRegIndex::CodeGenSubRegIndex(const Record *R, unsigned Enum, const CodeGenHwModes &CGH) : TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) { Name = std::string(R->getName()); @@ -99,7 +98,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) { PrintFatalError(TheDef->getLoc(), "CoveredBySubRegs must have two or more entries"); SmallVector<CodeGenSubRegIndex *, 8> IdxParts; - for (Record *Part : Parts) + for (const Record *Part : Parts) IdxParts.push_back(RegBank.getSubRegIdx(Part)); setConcatenationOf(IdxParts); } @@ -190,8 +189,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) { // Add ad hoc alias links. This is a symmetric relationship between two // registers, so build a symmetric graph by adding links in both ends. 
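  // Equivalently, for each record in this register's Aliases list:
  //   this->ExplicitAliases.push_back(Reg);  // forward link
  //   Reg->ExplicitAliases.push_back(this);  // backward link
  // so either endpoint of the alias pair can enumerate the other.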
- std::vector<Record *> Aliases = TheDef->getValueAsListOfDefs("Aliases"); - for (Record *Alias : Aliases) { + for (const Record *Alias : TheDef->getValueAsListOfDefs("Aliases")) { CodeGenRegister *Reg = RegBank.getReg(Alias); ExplicitAliases.push_back(Reg); Reg->ExplicitAliases.push_back(this); @@ -757,7 +755,8 @@ static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) { M.erase(llvm::unique(M, deref<std::equal_to<>>()), M.end()); } -CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) +CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, + const Record *R) : TheDef(R), Name(std::string(R->getName())), TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) { GeneratePressureSet = R->getValueAsBit("GeneratePressureSet"); @@ -765,7 +764,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) if (TypeList.empty()) PrintFatalError(R->getLoc(), "RegTypes list must not be empty!"); for (unsigned i = 0, e = TypeList.size(); i != e; ++i) { - Record *Type = TypeList[i]; + const Record *Type = TypeList[i]; if (!Type->isSubClassOf("ValueType")) PrintFatalError(R->getLoc(), "RegTypes list member '" + Type->getName() + @@ -1168,9 +1167,9 @@ void CodeGenRegisterClass::buildRegUnitSet( //===----------------------------------------------------------------------===// CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, - Record *R) + const Record *R) : TheDef(R), Name(std::string(R->getName())) { - for (Record *RegClass : R->getValueAsListOfDefs("Classes")) + for (const Record *RegClass : R->getValueAsListOfDefs("Classes")) Classes.push_back(RegBank.getRegClass(RegClass)); } @@ -1178,7 +1177,7 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, // CodeGenRegBank //===----------------------------------------------------------------------===// -CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, +CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records, const CodeGenHwModes &Modes) : CGH(Modes) { // Configure register Sets to understand register classes and tuples. @@ -1189,10 +1188,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, // Read in the user-defined (named) sub-register indices. // More indices will be synthesized later. - std::vector<Record *> SRIs = Records.getAllDerivedDefinitions("SubRegIndex"); - llvm::sort(SRIs, LessRecord()); - for (unsigned i = 0, e = SRIs.size(); i != e; ++i) - getSubRegIdx(SRIs[i]); + for (const Record *SRI : Records.getAllDerivedDefinitions("SubRegIndex")) + getSubRegIdx(SRI); // Build composite maps from ComposedOf fields. for (auto &Idx : SubRegIndices) Idx.updateComponents(*this); @@ -1223,7 +1220,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, getReg(Regs[i]); // Expand tuples and number the new registers. - for (Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) { + for (const Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) { std::vector<const Record *> TupRegs = *Sets.expand(R); llvm::sort(TupRegs, LessRecordRegister()); for (const Record *RC : TupRegs) @@ -1288,7 +1285,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, NumNativeRegUnits = RegUnits.size(); // Read in register class definitions. 
- std::vector<Record *> RCs = Records.getAllDerivedDefinitions("RegisterClass"); + ArrayRef<const Record *> RCs = + Records.getAllDerivedDefinitions("RegisterClass"); if (RCs.empty()) PrintFatalError("No 'RegisterClass' subclasses defined!"); @@ -1311,9 +1309,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records, CodeGenRegisterClass::computeSubClasses(*this); // Read in the register category definitions. - std::vector<Record *> RCats = - Records.getAllDerivedDefinitions("RegisterCategory"); - for (auto *R : RCats) + for (const Record *R : Records.getAllDerivedDefinitions("RegisterCategory")) RegCategories.emplace_back(*this, R); } @@ -1324,7 +1320,7 @@ CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name, return &SubRegIndices.back(); } -CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) { +CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(const Record *Def) { CodeGenSubRegIndex *&Idx = Def2SubRegIdx[Def]; if (Idx) return Idx; @@ -2450,7 +2446,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() { /// return null. If the register is in multiple classes, and the classes have a /// superset-subset relationship and the same set of types, return the /// superclass. Otherwise return null. -const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) { +const CodeGenRegisterClass * +CodeGenRegBank::getRegClassForRegister(const Record *R) { const CodeGenRegister *Reg = getReg(R); const CodeGenRegisterClass *FoundRC = nullptr; for (const auto &RC : getRegClasses()) { @@ -2490,7 +2487,7 @@ const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) { } const CodeGenRegisterClass * -CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord, +CodeGenRegBank::getMinimalPhysRegClass(const Record *RegRecord, ValueTypeByHwMode *VT) { const CodeGenRegister *Reg = getReg(RegRecord); const CodeGenRegisterClass *BestRC = nullptr; diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h index f0f53d3..2fa6cab 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.h +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h @@ -63,7 +63,7 @@ struct MaskRolPair { /// CodeGenSubRegIndex - Represents a sub-register index. class CodeGenSubRegIndex { - Record *const TheDef; + const Record *const TheDef; std::string Name; std::string Namespace; @@ -85,7 +85,7 @@ public: // indexes are not used to create new register classes. bool Artificial; - CodeGenSubRegIndex(Record *R, unsigned Enum, const CodeGenHwModes &CGH); + CodeGenSubRegIndex(const Record *R, unsigned Enum, const CodeGenHwModes &CGH); CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum); CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete; @@ -320,7 +320,7 @@ class CodeGenRegisterClass { // List of super-classes, topologocally ordered to have the larger classes // first. This is the same as sorting by EnumValue. SmallVector<CodeGenRegisterClass *, 4> SuperClasses; - Record *TheDef; + const Record *TheDef; std::string Name; // For a synthesized class, inherit missing properties from the nearest @@ -368,7 +368,7 @@ public: // Return the Record that defined this class, or NULL if the class was // created by TableGen. 
- Record *getDef() const { return TheDef; } + const Record *getDef() const { return TheDef; } std::string getNamespaceQualification() const; const std::string &getName() const { return Name; } @@ -473,7 +473,7 @@ public: void buildRegUnitSet(const CodeGenRegBank &RegBank, std::vector<unsigned> &RegUnits) const; - CodeGenRegisterClass(CodeGenRegBank &, Record *R); + CodeGenRegisterClass(CodeGenRegBank &, const Record *R); CodeGenRegisterClass(CodeGenRegisterClass &) = delete; // A key representing the parts of a register class used for forming @@ -511,17 +511,17 @@ public: // register falls into (GPR, vector, fixed, etc.) without having to know // specific information about the target architecture. class CodeGenRegisterCategory { - Record *TheDef; + const Record *TheDef; std::string Name; std::list<CodeGenRegisterClass *> Classes; public: - CodeGenRegisterCategory(CodeGenRegBank &, Record *R); + CodeGenRegisterCategory(CodeGenRegBank &, const Record *R); CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete; // Return the Record that defined this class, or NULL if the class was // created by TableGen. - Record *getDef() const { return TheDef; } + const Record *getDef() const { return TheDef; } std::string getName() const { return Name; } std::list<CodeGenRegisterClass *> getClasses() const { return Classes; } @@ -585,7 +585,7 @@ class CodeGenRegBank { const CodeGenHwModes &CGH; std::deque<CodeGenSubRegIndex> SubRegIndices; - DenseMap<Record *, CodeGenSubRegIndex *> Def2SubRegIdx; + DenseMap<const Record *, CodeGenSubRegIndex *> Def2SubRegIdx; CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace); @@ -612,7 +612,6 @@ class CodeGenRegBank { // Register categories. std::list<CodeGenRegisterCategory> RegCategories; - DenseMap<Record *, CodeGenRegisterCategory *> Def2RCat; using RCatKeyMap = std::map<CodeGenRegisterClass::Key, CodeGenRegisterCategory *>; RCatKeyMap Key2RCat; @@ -677,7 +676,7 @@ class CodeGenRegBank { void computeRegUnitLaneMasks(); public: - CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &); + CodeGenRegBank(const RecordKeeper &, const CodeGenHwModes &); CodeGenRegBank(CodeGenRegBank &) = delete; SetTheory &getSets() { return Sets; } @@ -693,7 +692,7 @@ public: // Find a SubRegIndex from its Record def or add to the list if it does // not exist there yet. - CodeGenSubRegIndex *getSubRegIdx(Record *); + CodeGenSubRegIndex *getSubRegIdx(const Record *); // Find a SubRegIndex from its Record def. const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const; @@ -785,14 +784,15 @@ public: /// class, return null. If the register is in multiple classes, and the /// classes have a superset-subset relationship and the same set of types, /// return the superclass. Otherwise return null. - const CodeGenRegisterClass *getRegClassForRegister(Record *R); + const CodeGenRegisterClass *getRegClassForRegister(const Record *R); // Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike // getRegClassForRegister, this tries to find the smallest class containing // the physical register. If \p VT is specified, it will only find classes // with a matching type const CodeGenRegisterClass * - getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr); + getMinimalPhysRegClass(const Record *RegRecord, + ValueTypeByHwMode *VT = nullptr); // Get the sum of unit weights. 
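  // Semantically (a sketch; the body is not part of this hunk):
  //   weight(Units) = sum of the per-unit Weight over every U in Units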
unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const { diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 0a48fea..3dcfdc9 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -609,7 +609,7 @@ void CodeGenSchedModels::collectSchedRW() { // Find all SchedReadWrites referenced by instruction defs. RecVec SWDefs, SRDefs; for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) { - Record *SchedDef = Inst->TheDef; + const Record *SchedDef = Inst->TheDef; if (SchedDef->isValueUnset("SchedRW")) continue; RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW"); diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h index 81a5e3e..49dc472 100644 --- a/llvm/utils/TableGen/Common/DAGISelMatcher.h +++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h @@ -983,14 +983,14 @@ private: /// recorded node and records the result. class EmitNodeXFormMatcher : public Matcher { unsigned Slot; - Record *NodeXForm; + const Record *NodeXForm; public: - EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm) + EmitNodeXFormMatcher(unsigned slot, const Record *nodeXForm) : Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {} unsigned getSlot() const { return Slot; } - Record *getNodeXForm() const { return NodeXForm; } + const Record *getNodeXForm() const { return NodeXForm; } static bool classof(const Matcher *N) { return N->getKind() == EmitNodeXForm; diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 139bf2d..0779b1e 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -822,7 +822,7 @@ SaveAndRestore<GISelFlags> RuleMatcher::setGISelFlags(const Record *R) { } Error RuleMatcher::defineComplexSubOperand(StringRef SymbolicName, - Record *ComplexPattern, + const Record *ComplexPattern, unsigned RendererID, unsigned SubOperandID, StringRef ParentSymbolicName) { diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 5b517b1..94f26d8 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -523,7 +523,7 @@ protected: ArrayRef<SMLoc> SrcLoc; - typedef std::tuple<Record *, unsigned, unsigned> + typedef std::tuple<const Record *, unsigned, unsigned> DefinedComplexPatternSubOperand; typedef StringMap<DefinedComplexPatternSubOperand> DefinedComplexPatternSubOperandMap; @@ -649,7 +649,8 @@ public: void definePhysRegOperand(Record *Reg, OperandMatcher &OM); - Error defineComplexSubOperand(StringRef SymbolicName, Record *ComplexPattern, + Error defineComplexSubOperand(StringRef SymbolicName, + const Record *ComplexPattern, unsigned RendererID, unsigned SubOperandID, StringRef ParentSymbolicName); diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp index 7425e7f..e5e8225 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp @@ -28,7 +28,8 @@ std::string llvm::getModeName(unsigned Mode) { return (Twine('m') + Twine(Mode)).str(); } -ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) { +ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, 
+ const CodeGenHwModes &CGH) { const HwModeSelect &MS = CGH.getHwModeSelect(R); for (const HwModeSelect::PairType &P : MS.Items) { auto I = Map.insert({P.first, MVT(llvm::getValueType(P.second))}); @@ -39,7 +40,8 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) { PtrAddrSpace = R->getValueAsInt("AddrSpace"); } -ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) { +ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, MVT T) + : ValueTypeByHwMode(T) { if (R->isSubClassOf("PtrValueType")) PtrAddrSpace = R->getValueAsInt("AddrSpace"); } @@ -102,7 +104,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const { LLVM_DUMP_METHOD void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; } -ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec, +ValueTypeByHwMode llvm::getValueTypeByHwMode(const Record *Rec, const CodeGenHwModes &CGH) { #ifndef NDEBUG if (!Rec->isSubClassOf("ValueType")) diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.h b/llvm/utils/TableGen/Common/InfoByHwMode.h index a6edf3c..4f11e8e 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.h +++ b/llvm/utils/TableGen/Common/InfoByHwMode.h @@ -152,8 +152,8 @@ protected: }; struct ValueTypeByHwMode : public InfoByHwMode<MVT> { - ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH); - ValueTypeByHwMode(Record *R, MVT T); + ValueTypeByHwMode(const Record *R, const CodeGenHwModes &CGH); + ValueTypeByHwMode(const Record *R, MVT T); ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); } ValueTypeByHwMode() = default; @@ -174,7 +174,8 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> { } }; -ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH); +ValueTypeByHwMode getValueTypeByHwMode(const Record *Rec, + const CodeGenHwModes &CGH); raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T); diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp index a4d6d8d..738ddf7 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp @@ -20,12 +20,10 @@ LLVM_DUMP_METHOD void SubtargetFeatureInfo::dump() const { } #endif -std::vector<std::pair<Record *, SubtargetFeatureInfo>> -SubtargetFeatureInfo::getAll(RecordKeeper &Records) { - std::vector<std::pair<Record *, SubtargetFeatureInfo>> SubtargetFeatures; - std::vector<Record *> AllPredicates = - Records.getAllDerivedDefinitions("Predicate"); - for (Record *Pred : AllPredicates) { +SubtargetFeaturesInfoVec +SubtargetFeatureInfo::getAll(const RecordKeeper &Records) { + SubtargetFeaturesInfoVec SubtargetFeatures; + for (const Record *Pred : Records.getAllDerivedDefinitions("Predicate")) { // Ignore predicates that are not intended for the assembler. 
// // The "AssemblerMatcherPredicate" string should be promoted to an argument diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h index fee2c02..d75a9a4 100644 --- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h @@ -19,18 +19,20 @@ namespace llvm { struct SubtargetFeatureInfo; using SubtargetFeatureInfoMap = - std::map<Record *, SubtargetFeatureInfo, LessRecordByID>; + std::map<const Record *, SubtargetFeatureInfo, LessRecordByID>; +using SubtargetFeaturesInfoVec = + std::vector<std::pair<const Record *, SubtargetFeatureInfo>>; /// Helper class for storing information on a subtarget feature which /// participates in instruction matching. struct SubtargetFeatureInfo { /// The predicate record for this feature. - Record *TheDef; + const Record *TheDef; /// An unique index assigned to represent this feature. uint64_t Index; - SubtargetFeatureInfo(Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {} + SubtargetFeatureInfo(const Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {} /// The name of the enumerated constant identifying this feature. std::string getEnumName() const { @@ -48,8 +50,8 @@ struct SubtargetFeatureInfo { } void dump() const; - static std::vector<std::pair<Record *, SubtargetFeatureInfo>> - getAll(RecordKeeper &Records); + + static SubtargetFeaturesInfoVec getAll(const RecordKeeper &Records); /// Emit the subtarget feature flag definitions. /// diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp index 049bd37..ce4cd35 100644 --- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp @@ -77,7 +77,7 @@ class VarLenCodeEmitterGen { // name suffix to improve readability of the generated code. std::map<AltEncodingTy, std::string> Modes; - DenseMap<Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts; + DenseMap<const Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts; // Emit based values (i.e. fixed bits in the encoded instructions) void emitInstructionBaseValues( @@ -227,7 +227,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) { auto NumberedInstructions = Target.getInstructionsByEnumValue(); for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; + const Record *R = CGI->TheDef; // Create the corresponding VarLenInst instance. 
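    // The filter below mirrors CodeEmitterGen::run earlier in this patch:
    // TargetOpcode::* and isPseudo instructions have no machine encoding,
    // so no VarLenInst is built for them.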
if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo"))
@@ -249,7 +249,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) { continue; } } - RecordVal *RV = R->getValue("Inst"); + const RecordVal *RV = R->getValue("Inst"); DagInit *DI = cast<DagInit>(RV->getValue()); VarLenInsts[R].insert({Universal, VarLenInst(DI, RV)}); }
@@ -356,7 +356,7 @@ void VarLenCodeEmitterGen::emitInstructionBaseValues( unsigned NumFixedValueWords = 0U; for (const CodeGenInstruction *CGI : NumberedInstructions) { - Record *R = CGI->TheDef; + const Record *R = CGI->TheDef; if (R->getValueAsString("Namespace") == "TargetOpcode" || R->getValueAsBit("isPseudo")) {
diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index b43a8e6..6c72103 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -47,7 +47,7 @@ static unsigned getResultPatternCost(TreePatternNode &P, return 0; unsigned Cost = 0; - Record *Op = P.getOperator(); + const Record *Op = P.getOperator(); if (Op->isSubClassOf("Instruction")) { Cost++; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -67,7 +67,7 @@ static unsigned getResultPatternSize(TreePatternNode &P, return 0; unsigned Cost = 0; - Record *Op = P.getOperator(); + const Record *Op = P.getOperator(); if (Op->isSubClassOf("Instruction")) { Cost += Op->getValueAsInt("CodeSize"); }
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index a14cc3d..96a40f0 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -64,8 +64,8 @@ class MatcherTableEmitter { std::vector<const ComplexPattern *> ComplexPatterns; - DenseMap<Record *, unsigned> NodeXFormMap; - std::vector<Record *> NodeXForms; + DenseMap<const Record *, unsigned> NodeXFormMap; + std::vector<const Record *> NodeXForms; std::vector<std::string> VecIncludeStrings; MapVector<std::string, unsigned, StringMap<unsigned>> VecPatterns;
@@ -203,7 +203,7 @@ private: return llvm::find(ComplexPatterns, &P) - ComplexPatterns.begin(); } - unsigned getNodeXFormID(Record *Rec) { + unsigned getNodeXFormID(const Record *Rec) { unsigned &Entry = NodeXFormMap[Rec]; if (Entry == 0) { NodeXForms.push_back(Rec);
@@ -930,7 +930,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N, GetPatFromTreePatternNode(SNT->getPattern().getSrcPattern()); std::string dst = GetPatFromTreePatternNode(SNT->getPattern().getDstPattern()); - Record *PatRecord = SNT->getPattern().getSrcRecord(); + const Record *PatRecord = SNT->getPattern().getSrcRecord(); std::string include_src = getIncludePath(PatRecord); unsigned Offset = getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
@@ -1043,7 +1043,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N, GetPatFromTreePatternNode(CM->getPattern().getSrcPattern()); std::string dst = GetPatFromTreePatternNode(CM->getPattern().getDstPattern()); - Record *PatRecord = CM->getPattern().getSrcRecord(); + const Record *PatRecord = CM->getPattern().getSrcRecord(); std::string include_src = getIncludePath(PatRecord); unsigned Offset = getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 4e65690..bb8f4dc 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -746,7 +746,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N, static bool mayInstNodeLoadOrStore(const TreePatternNode &N, const CodeGenDAGPatterns &CGP) { - Record *Op = N.getOperator(); + const Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); return II.mayLoad || II.mayStore;
@@ -757,7 +757,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N, if (N.isLeaf()) return 0; - Record *OpRec = N.getOperator(); + const Record *OpRec = N.getOperator(); if (!OpRec->isSubClassOf("Instruction")) return 0;
@@ -773,7 +773,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N, void MatcherGen::EmitResultInstructionAsOperand( const TreePatternNode &N, SmallVectorImpl<unsigned> &OutputOps) { - Record *Op = N.getOperator(); + const Record *Op = N.getOperator(); const CodeGenTarget &CGT = CGP.getTargetInfo(); CodeGenInstruction &II = CGT.getInstruction(Op); const DAGInstruction &Inst = CGP.getInstruction(Op);
@@ -1010,7 +1010,7 @@ void MatcherGen::EmitResultOperand(const TreePatternNode &N, if (N.isLeaf()) return EmitResultLeafAsOperand(N, ResultOps); - Record *OpRec = N.getOperator(); + const Record *OpRec = N.getOperator(); if (OpRec->isSubClassOf("Instruction")) return EmitResultInstructionAsOperand(N, ResultOps); if (OpRec->isSubClassOf("SDNodeXForm"))
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 2ef98b3..01df873 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -236,7 +236,7 @@ struct OperandsSignature { // not needed and just bloat the fast instruction selector. For // example, X86 doesn't need to generate code to match ADD16ri8 since // ADD16ri will do just fine.
- Record *Rec = PredFn.getOrigPatFragRecord()->getRecord(); + const Record *Rec = PredFn.getOrigPatFragRecord()->getRecord(); if (Rec->getValueAsBit("FastIselShouldIgnore")) return false;
@@ -417,7 +417,7 @@ private: }; } // End anonymous namespace -static std::string getOpcodeName(Record *Op, CodeGenDAGPatterns &CGP) { +static std::string getOpcodeName(const Record *Op, CodeGenDAGPatterns &CGP) { return std::string(CGP.getSDNodeInfo(Op).getEnumName()); }
@@ -461,7 +461,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { TreePatternNode &Dst = Pattern.getDstPattern(); if (Dst.isLeaf()) continue; - Record *Op = Dst.getOperator(); + const Record *Op = Dst.getOperator(); if (!Op->isSubClassOf("Instruction")) continue; CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -524,7 +524,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) { if (InstPatNode.getNumTypes() > 1) continue; - Record *InstPatOp = InstPatNode.getOperator(); + const Record *InstPatOp = InstPatNode.getOperator(); std::string OpcodeName = getOpcodeName(InstPatOp, CGP); MVT::SimpleValueType RetVT = MVT::isVoid; if (InstPatNode.getNumTypes())
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index b2f4d32..d82f1c3 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -120,7 +120,7 @@ static std::string explainPredicates(const TreePatternNode &N) { if (P.isTruncStore()) Explanation += " truncstore"; - if (Record *VT = P.getMemoryVT()) + if (const Record *VT = P.getMemoryVT()) Explanation += (" MemVT=" + VT->getName()).str(); if (Record *VT = P.getScalarMemoryVT()) Explanation += (" ScalarVT(MemVT)=" + VT->getName()).str();
@@ -168,7 +168,7 @@ static std::string explainPredicates(const TreePatternNode &N) { return Explanation; } -std::string explainOperator(Record *Operator) { +std::string explainOperator(const Record *Operator) { if (Operator->isSubClassOf("SDNode")) return (" (" + Operator->getValueAsString("Opcode") + ")").str();
@@ -346,7 +346,7 @@ private: /// SDNodes to the GINodeEquiv mapping. We need to map to the GINodeEquiv to /// check for attributes on the relation such as CheckMMOIsNonAtomic. /// This is defined using 'GINodeEquiv' in the target description. - DenseMap<Record *, Record *> NodeEquivs; + DenseMap<const Record *, Record *> NodeEquivs; /// Keep track of the equivalence between ComplexPattern's and /// GIComplexOperandMatcher. Map entries are specified by subclassing
@@ -379,7 +379,7 @@ private: void gatherTypeIDValues(); void gatherNodeEquivs(); - Record *findNodeEquiv(Record *N) const; + Record *findNodeEquiv(const Record *N) const; const CodeGenInstruction *getEquivNode(Record &Equiv, const TreePatternNode &N) const;
@@ -388,7 +388,7 @@ private: createAndImportSelDAGMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, const TreePatternNode &Src, unsigned &TempOpIdx); - Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R, + Error importComplexPatternOperandMatcher(OperandMatcher &OM, const Record *R, unsigned &TempOpIdx) const; Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher, const TreePatternNode &SrcChild,
@@ -504,7 +504,7 @@ void GlobalISelEmitter::gatherNodeEquivs() { } } -Record *GlobalISelEmitter::findNodeEquiv(Record *N) const { +Record *GlobalISelEmitter::findNodeEquiv(const Record *N) const { return NodeEquivs.lookup(N); }
@@ -928,7 +928,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher( } Error GlobalISelEmitter::importComplexPatternOperandMatcher( - OperandMatcher &OM, Record *R, unsigned &TempOpIdx) const { + OperandMatcher &OM, const Record *R, unsigned &TempOpIdx) const { const auto &ComplexPattern = ComplexPatternEquivs.find(R); if (ComplexPattern == ComplexPatternEquivs.end()) return failedImport("SelectionDAG ComplexPattern (" + R->getName() +
@@ -1508,7 +1508,7 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer( Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer( action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) { - Record *DstOp = Dst.getOperator(); + const Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) { if (DstOp->isSubClassOf("ValueType")) return failedImport(
@@ -1813,7 +1813,7 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) { // just take the first one). if (N.getNumTypes() < 1) return std::nullopt; - Record *OpRec = N.getOperator(); + const Record *OpRec = N.getOperator(); // We only want instructions. if (!OpRec->isSubClassOf("Instruction"))
@@ -2011,7 +2011,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { } // Start with the defined operands (i.e., the results of the root operator). - Record *DstOp = Dst.getOperator(); + const Record *DstOp = Dst.getOperator(); if (!DstOp->isSubClassOf("Instruction")) return failedImport("Pattern operator isn't an instruction");
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp index f948540..f53428e 100644 --- a/llvm/utils/TableGen/InstrDocsEmitter.cpp +++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -73,7 +73,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) { OS << "\n"; for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) { - Record *Inst = II->TheDef; + const Record *Inst = II->TheDef; // Don't print the target-independent instructions.
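One detail worth noting in the GlobalISelEmitter hunks above: findNodeEquiv simply forwards to DenseMap::lookup, which returns a value-initialized result (a null Record *) for a missing key rather than inserting or throwing, so callers can branch on the returned pointer directly. A self-contained sketch of the same idiom, using std::map and illustrative types instead of the LLVM ones:

#include <map>
#include <string>

struct Record {
  std::string Name;
};

// Mirrors llvm::DenseMap::lookup: a missing key yields a value-initialized
// mapped value, i.e. nullptr for pointer maps, without mutating the map.
template <typename K, typename V>
V lookup(const std::map<K, V> &M, const K &Key) {
  auto It = M.find(Key);
  return It == M.end() ? V() : It->second;
}

int main() {
  Record SDNodeAdd{"ISD::ADD"}, GINodeAdd{"G_ADD"};
  std::map<const Record *, Record *> NodeEquivs;
  NodeEquivs[&SDNodeAdd] = &GINodeAdd;

  const Record *Known = &SDNodeAdd;
  const Record *Unknown = nullptr;
  Record *Hit = lookup(NodeEquivs, Known);    // the mapped equivalent node
  Record *Miss = lookup(NodeEquivs, Unknown); // nullptr, safe to test
  return (Hit == &GINodeAdd && Miss == nullptr) ? 0 : 1;
}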
if (II->Namespace == "TargetOpcode")
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 5fd5914..4e2138d 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -721,9 +721,9 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, } static std::string -getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) { +getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) { std::string Name = "CEFBS"; - for (const auto &Feature : FeatureBitset) + for (const Record *Feature : FeatureBitset) Name += ("_" + Feature->getName()).str(); return Name; }
@@ -731,7 +731,7 @@ getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) { void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, const CodeGenTarget &Target) { const auto &All = SubtargetFeatureInfo::getAll(Records); - std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures; + SubtargetFeatureInfoMap SubtargetFeatures; SubtargetFeatures.insert(All.begin(), All.end()); OS << "#if (defined(ENABLE_INSTR_PREDICATE_VERIFIER) && !defined(NDEBUG)) "
@@ -752,18 +752,19 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures( Target.getName(), "", "computeAvailableFeatures", SubtargetFeatures, OS); - std::vector<std::vector<Record *>> FeatureBitsets; + std::vector<std::vector<const Record *>> FeatureBitsets; for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) { FeatureBitsets.emplace_back(); - for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) { + for (const Record *Predicate : + Inst->TheDef->getValueAsListOfDefs("Predicates")) { const auto &I = SubtargetFeatures.find(Predicate); if (I != SubtargetFeatures.end()) FeatureBitsets.back().push_back(I->second.TheDef); } } - llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A, - const std::vector<Record *> &B) { + llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A, + const std::vector<const Record *> &B) { if (A.size() < B.size()) return true; if (A.size() > B.size())
@@ -806,7 +807,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) { OS << " CEFBS"; unsigned NumPredicates = 0; - for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) { + for (const Record *Predicate : + Inst->TheDef->getValueAsListOfDefs("Predicates")) { const auto &I = SubtargetFeatures.find(Predicate); if (I != SubtargetFeatures.end()) { OS << '_' << I->second.TheDef->getName();
@@ -890,7 +892,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS, StringRef TargetName, bool ExpandDefinition) { - RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate"); + ArrayRef<const Record *> TIIPredicates = + Records.getAllDerivedDefinitions("TIIPredicate"); if (TIIPredicates.empty()) return;
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 0fc930b..8952c8e 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -446,7 +446,7 @@ void X86FoldTablesEmitter::addEntryWithFlags(FoldTable &Table, assert((IsManual || Table.find(RegInst) == Table.end()) && "Override entry unexpectedly"); X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst); - Record *RegRec = RegInst->TheDef; + const Record *RegRec = RegInst->TheDef; Result.NoReverse = S & TB_NO_REVERSE; Result.NoForward = S & TB_NO_FORWARD; Result.FoldLoad = S & TB_FOLDED_LOAD;
@@ -537,8 +537,8 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInst, uint16_t S, bool IsManual, bool IsBroadcast) { - Record *RegRec = RegInst->TheDef; - Record *MemRec = MemInst->TheDef; + const Record *RegRec = RegInst->TheDef; + const Record *MemRec = MemInst->TheDef; unsigned MemOutSize = MemRec->getValueAsDag("OutOperandList")->getNumArgs(); unsigned RegOutSize = RegRec->getValueAsDag("OutOperandList")->getNumArgs(); unsigned MemInSize = MemRec->getValueAsDag("InOperandList")->getNumArgs();
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn index fe8a3f5..acf9e7a 100644 --- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -35,6 +35,7 @@ static_library("Target") { sources = [ "ABI.cpp", "AssertFrameRecognizer.cpp", + "CoreFileMemoryRanges.cpp", "DynamicRegisterInfo.cpp", "ExecutionContext.cpp", "InstrumentationRuntime.cpp",
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 2fda091..aceb9d0 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -152,7 +152,7 @@ public: /// This function adds a DENY entry. void denyDialect(StringRef dialectNamespace) { Entry::FilterFn filterFn = [=](Operation *op) { - return op->getDialect()->getNamespace() == dialectNamespace; + return op->getName().getDialectNamespace() == dialectNamespace; }; entries.push_back(Entry{filterFn, Entry::FilterType::DENY}); }
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt index 12b4fc4..1ee105f 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
@@ -3,8 +3,8 @@ add_mlir_doc(TosaOps TosaOps Dialects/ -gen-op-doc) add_mlir_interface(TosaInterfaces) set(LLVM_TARGET_DEFINITIONS TosaOps.td) -mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls) -mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs) +mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=tosa) +mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=tosa) add_public_tablegen_target(MLIRTosaAttributesIncGen) set(LLVM_TARGET_DEFINITIONS TosaDialectBytecode.td)
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index ab6daa3..63572f2 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -73,7 +73,6 @@ def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> { let arguments = (ins Tosa_Tensor4D:$input, - Tosa_IntArrayAttr2:$kernel, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr4:$pad,
@@ -102,9 +101,8 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> { let arguments = (ins Tosa_Tensor4D:$input, - 4DTensorOf<[Tosa_Weight]>:$weight, + TosaTensorRankOf<[Tosa_Weight], [4]>:$weight, Tosa_Tensor1D:$bias, - Tosa_IntArrayAttr4:$pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr2:$dilation,
@@ -132,9 +130,8 @@ def Tosa_Conv3DOp : Tosa_InferShapedTypeOp<"conv3d"> { let arguments = (ins Tosa_Tensor5D:$input, - TensorRankOf<[Tosa_Weight], [5]>:$weight, + TosaTensorRankOf<[Tosa_Weight], [5]>:$weight, Tosa_Tensor1D:$bias, - Tosa_IntArrayAttr6:$pad, Tosa_IntArrayAttr3:$stride, Tosa_IntArrayAttr3:$dilation,
@@ -163,9 +160,8 @@ def Tosa_DepthwiseConv2DOp : Tosa_InferShapedTypeOp<"depthwise_conv2d"> { let arguments = (ins Tosa_Tensor4D:$input, - 4DTensorOf<[Tosa_Weight]>:$weight, + TosaTensorRankOf<[Tosa_Weight], [4]>:$weight, Tosa_Tensor1D:$bias, - Tosa_IntArrayAttr4:$pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr2:$dilation,
@@ -232,7 +228,7 @@ def Tosa_FullyConnectedOp : Tosa_InferShapedTypeOp<"fully_connected"> { let arguments = (ins Tosa_Tensor2D:$input, - 2DTensorOf<[Tosa_Weight]>:$weight, + TosaTensorRankOf<[Tosa_Weight], [2]>:$weight, Tosa_Tensor1D:$bias, OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info );
@@ -347,9 +343,8 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> { let arguments = (ins Tosa_Tensor4D:$input, - 4DTensorOf<[Tosa_Weight]>:$filter, + TosaTensorRankOf<[Tosa_Weight], [4]>:$filter, Tosa_Tensor1D:$bias, - Tosa_IntArrayAttr4:$out_pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttrUpto4:$out_shape,
@@ -641,12 +636,12 @@ def Tosa_LogicalAndOp : Tosa_ElementwiseOp<"logical_and", [ }]; let arguments = (ins - I1Tensor:$input1, - I1Tensor:$input2 + Tosa_I1Tensor:$input1, + Tosa_I1Tensor:$input2 ); let results = (outs - I1Tensor:$z + Tosa_I1Tensor:$z ); }
@@ -708,12 +703,12 @@ def Tosa_LogicalOrOp : Tosa_ElementwiseOp<"logical_or", [ }]; let arguments = (ins - I1Tensor:$input1, - I1Tensor:$input2 + Tosa_I1Tensor:$input1, + Tosa_I1Tensor:$input2 ); let results = (outs - I1Tensor:$z + Tosa_I1Tensor:$z ); }
@@ -731,12 +726,12 @@ def Tosa_LogicalXorOp : Tosa_ElementwiseOp<"logical_xor", [ }]; let arguments = (ins - I1Tensor:$input1, - I1Tensor:$input2 + Tosa_I1Tensor:$input1, + Tosa_I1Tensor:$input2 ); let results = (outs - I1Tensor:$z + Tosa_I1Tensor:$z ); }
@@ -1085,11 +1080,11 @@ def Tosa_LogicalNotOp : Tosa_ElementwiseOp<"logical_not", }]; let arguments = (ins - I1Tensor:$input1 + Tosa_I1Tensor:$input1 ); let results = (outs - I1Tensor:$output + Tosa_I1Tensor:$output ); }
@@ -1208,7 +1203,7 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> { }]; let arguments = (ins - I1Tensor:$pred, + Tosa_I1Tensor:$pred, Tosa_Tensor:$on_true, Tosa_Tensor:$on_false );
@@ -1249,7 +1244,7 @@ def Tosa_EqualOp : Tosa_ElementwiseOp<"equal", [ ); let results = (outs - I1Tensor:$output + Tosa_I1Tensor:$output ); let extraClassDeclaration = [{
@@ -1277,7 +1272,7 @@ def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType]> { ); let results = (outs - I1Tensor:$output + Tosa_I1Tensor:$output ); let hasFolder = 1;
@@ -1300,7 +1295,7 @@ def Tosa_GreaterEqualOp : Tosa_ElementwiseOp<"greater_equal", ); let results = (outs - I1Tensor:$output + Tosa_I1Tensor:$output ); let hasFolder = 1;
@@ -1721,7 +1716,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose", let arguments = (ins Tosa_Tensor:$input1, - Tosa_Int32Or64Tensor:$perms + Tosa_Int32Tensor:$perms ); let results = (
@@ -1729,7 +1724,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose", ); let extraClassDeclaration = [{ - LogicalResult getConstantPerms(llvm::SmallVector<int64_t> &perms); + LogicalResult getConstantPerms(llvm::SmallVector<int32_t> &perms); }]; let hasCanonicalizer = 1;
@@ -1755,7 +1750,7 @@ def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> { let arguments = (ins Tosa_Tensor3D:$values, - 2DTensorOf<[Tosa_Int32]>:$indices + TosaTensorRankOf<[Tosa_Int32], [2]>:$indices ); let results = (outs
@@ -1776,7 +1771,7 @@ def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> { let arguments = (ins Tosa_Tensor3D:$values_in, - 2DTensorOf<[Tosa_Int32]>:$indices, + TosaTensorRankOf<[Tosa_Int32], [2]>:$indices, Tosa_Tensor3D:$input );
@@ -1947,10 +1942,11 @@ def Tosa_ConstOp : Tosa_Op<"const", [ConstantLike, Pure, ); let results = (outs - TensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output + TosaTensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output ); let hasFolder = 1; + let hasVerifier = 1; }
//===----------------------------------------------------------------------===//
@@ -2054,7 +2050,7 @@ def Tosa_IfOp : Tosa_Op<"cond_if", }]; let arguments = (ins - I1Tensor:$cond, + Tosa_I1Tensor:$cond, Variadic<Tosa_Tensor>:$inputs );
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 14fc9c7..c3a0128 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -83,57 +83,82 @@ def Tosa_Weight : AnyTypeOf<[Tosa_Int4, Tosa_Int8, Tosa_QuantizedInt, AnyFloat]>; //===----------------------------------------------------------------------===// +// TOSA Tensor Conformance +//===----------------------------------------------------------------------===// + +def HasNo0Dimensions : And<[ + IsRankedTensorTypePred, + CPred<"::llvm::all_of(::llvm::cast<::mlir::RankedTensorType>($_self).getShape(), [](auto v) { return v != 0; })">]>; + +class TosaTensorOf< + list<Type> allowedTypes, string summary = "tosa-conformant tensor"> + : TensorOf<allowedTypes, [Or<[HasNo0Dimensions, IsUnrankedTensorTypePred]>], summary>; + +class TosaRankedTensorOf< + list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant ranked tensor"> + : RankedTensorOf<allowedTypes, !listconcat([HasNo0Dimensions], preds), summary>; + +class TosaUnrankedTensorOf<list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant unranked tensor"> + : UnrankedTensorOf<allowedTypes, preds, summary>; + +class TosaTensorRankOf<list<Type> allowedTypes, list<int> ranks> + : TosaRankedTensorOf<allowedTypes, + [HasAnyRankOfPred<ranks>], + !interleave(!foreach(rank, ranks, rank # "D"), "/") # " tensor">; + +//===----------------------------------------------------------------------===// // Tensor types //===----------------------------------------------------------------------===// -def Tosa_Int32Tensor : TensorOf<[Tosa_Int32]>; -def Tosa_Int32Or64Tensor : TensorOf<[Tosa_Int32Or64]>; +def Tosa_I1Tensor : TosaTensorOf<[I1]>; +def Tosa_Int32Tensor : TosaTensorOf<[Tosa_Int32]>; +def Tosa_Int32Or64Tensor :TosaTensorOf<[Tosa_Int32Or64]>; -def Tosa_FloatTensor : TensorOf<[AnyFloat]>; +def Tosa_FloatTensor : TosaTensorOf<[AnyFloat]>; // Either ranked or unranked tensor of TOSA supported element types. -def Tosa_Tensor : TensorOf<[Tosa_AnyNumber]>; +def Tosa_Tensor : TosaTensorOf<[Tosa_AnyNumber]>; // Must be ranked but no further constraints -def Tosa_RankedTensor : RankedTensorOf<[Tosa_AnyNumber]>; +def Tosa_RankedTensor : TosaRankedTensorOf<[Tosa_AnyNumber]>; // Any tensor element type allowed in Tosa ops.
def Tosa_ElementType : Type<Or<[Tosa_Int.predicate, Tosa_QuantizedInt.predicate, AnyFloat.predicate]>, "tosa.dtype">; class Tosa_TensorOfOrNone<list<Type> allowedTypes, string description = ""> : - AnyTypeOf<[TensorOf<allowedTypes>, NoneType], description>; + AnyTypeOf<[TosaTensorOf<allowedTypes>, NoneType], description>; //===----------------------------------------------------------------------===// // Tensor types with constrained ranks. //===----------------------------------------------------------------------===// // Rank-0 (scalar) tensor -def Tosa_ScalarTensor : TensorRankOf<[Tosa_AnyNumber], [0]>; +def Tosa_ScalarTensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>; // We include unranked tensors as a supported type for all possible tosa // Tensors as unranked does not guarantee invalid. If unranked tensors exist // they should be shape propagate used Tosa's shape inference pass and verified // to not include any remaining unranked tensors. -def Tosa_UnrankedTensor : UnrankedTensorOf<[Tosa_AnyNumber]>; +def Tosa_UnrankedTensor : TosaUnrankedTensorOf<[Tosa_AnyNumber]>; -def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>], "1-d tensor", "::mlir::TensorType">; -def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>], "2-d tensor", "::mlir::TensorType">; -def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>], "3-d tensor", "::mlir::TensorType">; -def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>], "4-d tensor", "::mlir::TensorType">; -def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tensor", "::mlir::TensorType">; +def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1]>], "1-d tosa-conformant tensor", "::mlir::TensorType">; +def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [2]>], "2-d tosa-conformant tensor", "::mlir::TensorType">; +def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [3]>], "3-d tosa-conformant tensor", "::mlir::TensorType">; +def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [4]>], "4-d tosa-conformant tensor", "::mlir::TensorType">; +def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tosa-conformant tensor", "::mlir::TensorType">; // Ranked tensors up to given rank. def Tosa_Tensor1Dto4D : AnyTypeOf<[ - Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>; + Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>; def Tosa_Tensor1Dto6D : AnyTypeOf<[ - Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>; + Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>; def Tosa_TensorUpto4D : AnyTypeOf<[ - Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>; + Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>; def Tosa_Int32TensorUpto4D : AnyTypeOf<[ - Tosa_UnrankedTensor, TensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>; + Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>; //===----------------------------------------------------------------------===// // Generic scalar, vector, or tensor of a particular type. 
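Read as ordinary code, the conformance classes defined in the hunk above say: a tensor is tosa-conformant when it is unranked, or when none of its dimensions is statically zero. MLIR encodes a dynamic dimension as a negative sentinel, so dynamic dimensions pass the v != 0 test, which matches the comment about keeping unranked and partially-unknown shapes legal until shape inference refines them. A rough stand-alone C++ rendering of the HasNo0Dimensions logic, with std types standing in for MLIR's ShapedType (illustrative, not the generated predicate):

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// MLIR's ShapedType::kDynamic is a negative sentinel; any non-zero value,
// dynamic included, is acceptable to the HasNo0Dimensions predicate.
constexpr int64_t kDynamic = INT64_MIN;

// nullopt models an unranked tensor, which TOSA continues to accept.
bool isTosaConformant(const std::optional<std::vector<int64_t>> &Shape) {
  if (!Shape)
    return true; // unranked: conformant by definition
  return std::all_of(Shape->begin(), Shape->end(),
                     [](int64_t Dim) { return Dim != 0; });
}

int main() {
  std::vector<int64_t> WithDynamic{1, kDynamic, 3}; // tensor<1x?x3xf32>
  std::vector<int64_t> WithZero{13, 0, 3};          // tensor<13x0x3xf32>
  bool Ok = isTosaConformant(std::nullopt) &&       // tensor<*xf32>
            isTosaConformant(WithDynamic) &&
            !isTosaConformant(WithZero);
  return Ok ? 0 : 1;
}

Moving the zero-dimension rule into the type constraint is what lets the hand-written hasZeroDimension verifier checks be deleted later in this patch: the ODS-generated operand verification now rejects such tensors before any custom verifier runs.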
@@ -142,7 +167,7 @@ def Tosa_Int32TensorUpto4D : AnyTypeOf<[ class Tosa_TypeLike<list<Type> types, string description = ""> : TypeConstraint<Or<[ AnyTypeOf<types>.predicate, VectorOf<types>.predicate, - TensorOf<types>.predicate]>, + TosaTensorOf<types>.predicate]>, description>; def Tosa_IntLike : Tosa_TypeLike<[Tosa_Int], "signless-integer-like">; diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h index ef40b34..90fea1f 100644 --- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h +++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h @@ -216,6 +216,19 @@ TosaOp CreateOpAndInferShape(PatternRewriter &rewriter, Location loc, return CreateOpAndInferShape<TosaOp>(builder, resultTy, args...); } +// Apply an int32_t permutation to some input, that should be of the same +// size as perms. Perms should contain some permutation of 0 - perms.size() - 1. +template <typename T> +SmallVector<T> applyTOSAPermutation(ArrayRef<T> input, + ArrayRef<int32_t> perms) { + SmallVector<T> permuted; + size_t N = input.size(); + permuted.resize_for_overwrite(N); + for (size_t i = 0; i < N; i++) + permuted[i] = input[perms[i]]; + return permuted; +} + } // namespace tosa } // namespace mlir diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 93e8b08..2992671 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -329,10 +329,9 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); target.addLegalDialect<ROCDL::ROCDLDialect>(); target.addIllegalDialect<gpu::GPUDialect>(); - target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp, - LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, - LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, - LLVM::SqrtOp>(); + target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp, + LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op, + LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>(); // TODO: Remove once we support replacing non-root ops. target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>(); diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index 77c3d2e..fe53b49 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -313,7 +313,7 @@ public: // convolution operation. // TODO(suderman): See if this can be efficiently folded - check whether // the input is used anywhere else, if not fold the constant. - SmallVector<int64_t> weightPerm; + SmallVector<int32_t> weightPerm; for (int i = 1; i < resultTy.getRank(); i++) weightPerm.push_back(i); weightPerm.push_back(0); @@ -321,7 +321,7 @@ public: SmallVector<int64_t> newWeightShape; for (auto dim : weightPerm) newWeightShape.push_back(weightShape[dim]); - auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm); + auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm); Value weightPermValue = rewriter.create<arith::ConstantOp>(loc, weightPermAttr); Type newWeightTy = @@ -337,7 +337,7 @@ public: if (5 == inputTy.getRank()) { // TODO(suderman): See if this can be efficiently folded - check whether // the input is used anywhere else, if not fold the constant. 
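Per its comment, the applyTOSAPermutation helper introduced in ConversionUtils.h above assumes perms holds each index 0 through size-1 exactly once and produces permuted[i] = input[perms[i]]. It is used by the TOSA-to-Linalg transpose rewrite below. A stand-alone equivalent with std::vector (resize_for_overwrite is an LLVM SmallVector API, so a plain resize stands in for it here):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Equivalent of tosa::applyTOSAPermutation: result[i] = input[perms[i]].
template <typename T>
std::vector<T> applyPermutation(const std::vector<T> &Input,
                                const std::vector<int32_t> &Perms) {
  assert(Input.size() == Perms.size() && "perms must match input size");
  std::vector<T> Permuted(Input.size());
  for (size_t I = 0, N = Input.size(); I < N; ++I)
    Permuted[I] = Input[Perms[I]];
  return Permuted;
}

int main() {
  // perms = [2, 0, 1] moves dimension 2 to the front: [h, w, c] -> [c, h, w].
  std::vector<int64_t> Shape = {13, 21, 3};
  std::vector<int32_t> Perms = {2, 0, 1};
  std::vector<int64_t> Out = applyPermutation(Shape, Perms);
  return (Out == std::vector<int64_t>{3, 13, 21}) ? 0 : 1;
}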
- SmallVector<int64_t> weightPerm; + SmallVector<int32_t> weightPerm; for (int i = 1; i < resultTy.getRank(); i++) weightPerm.push_back(i); weightPerm.push_back(0);
@@ -345,7 +345,7 @@ public: SmallVector<int64_t> newWeightShape; for (auto dim : weightPerm) newWeightShape.push_back(weightShape[dim]); - auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm); + auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm); Value weightPermValue = rewriter.create<arith::ConstantOp>(loc, weightPermAttr); Type newWeightTy =
@@ -1040,22 +1040,25 @@ public: LogicalResult matchAndRewrite(tosa::TransposeOp op, PatternRewriter &rewriter) const final { - SmallVector<int64_t> constantPerms; + SmallVector<int32_t> constantPerms; if (failed(op.getConstantPerms(constantPerms))) return failure(); Location loc = op.getLoc(); - // The verifier should have made sure we have a valid permutation tensor. - assert(isPermutationVector(constantPerms) && "Expected valid permutation"); + // The verifier should have made sure we have a valid TOSA permutation + // tensor. isPermutationVector doesn't actually check the TOSA perms we + // expect. SmallVector<OpFoldResult> inputSizes = tensor::getMixedSizes(rewriter, loc, op.getInput1()); auto permutedSizes = - applyPermutation<OpFoldResult>(inputSizes, constantPerms); + applyTOSAPermutation<OpFoldResult>(inputSizes, constantPerms); auto permutedInit = rewriter.create<tensor::EmptyOp>( loc, permutedSizes, op.getInput1().getType().getElementType()); rewriter.replaceOpWithNewOp<linalg::TransposeOp>( - op, op.getInput1(), permutedInit, constantPerms); + op, op.getInput1(), permutedInit, + llvm::to_vector(llvm::map_range( + constantPerms, [](int32_t v) -> int64_t { return v; }))); return success(); } };
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index da9a93f..03876a7 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -88,7 +88,7 @@ struct ConsolidateTransposeOptimization return rewriter.notifyMatchFailure(transposeOp, "input must be transpose operation"); - SmallVector<int64_t> transposePerms, innerTransposePerms; + SmallVector<int32_t> transposePerms, innerTransposePerms; if (transposeOp.getConstantPerms(transposePerms).failed()) return rewriter.notifyMatchFailure(transposeOp, "transpose perms must be constant");
@@ -497,8 +497,10 @@ OpFoldResult AddOp::fold(FoldAdaptor adaptor) { return {}; auto resultETy = resultTy.getElementType(); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr)) return getInput1();
@@ -536,8 +538,10 @@ OpFoldResult IntDivOp::fold(FoldAdaptor adaptor) { // IntDivOp inputs must be integer type, no need to check for quantized type auto resultETy = resultTy.getElementType(); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); if (lhsAttr && lhsAttr.isSplat()) { if (llvm::isa<IntegerType>(resultETy) && lhsAttr.getSplatValue<APInt>().isZero())
@@ -605,10 +609,13 @@ OpFoldResult MulOp::fold(FoldAdaptor adaptor) { return {}; auto resultETy = resultTy.getElementType(); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); const int64_t shift = llvm::isa<IntegerType>(resultETy) ? getShift() : 0; + if (rhsTy == resultTy) { if (isSplatZero(resultETy, lhsAttr)) return lhsAttr.resizeSplat(resultTy);
@@ -638,8 +645,10 @@ OpFoldResult SubOp::fold(FoldAdaptor adaptor) { return {}; auto resultETy = resultTy.getElementType(); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr)) return getInput1();
@@ -681,8 +690,10 @@ struct APIntFoldGreaterEqual { OpFoldResult GreaterOp::fold(FoldAdaptor adaptor) { auto resultTy = llvm::dyn_cast<RankedTensorType>(getType()); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); if (!lhsAttr || !rhsAttr) return {};
@@ -693,8 +704,10 @@ OpFoldResult GreaterEqualOp::fold(FoldAdaptor adaptor) { auto resultTy = llvm::dyn_cast<RankedTensorType>(getType()); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); if (!lhsAttr || !rhsAttr) return {};
@@ -706,8 +719,10 @@ OpFoldResult EqualOp::fold(FoldAdaptor adaptor) { auto resultTy = llvm::dyn_cast<RankedTensorType>(getType()); - auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); - auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); + auto lhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1()); + auto rhsAttr = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2()); Value lhs = getInput1(); Value rhs = getInput2(); auto lhsTy = llvm::cast<ShapedType>(lhs.getType());
@@ -838,14 +853,16 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) { return {}; // reshape(const(x)) -> const(reshape-attr(x)) - if (auto operand = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) { + if (auto operand = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) { // Constants must have static shape. if (!outputTy.hasStaticShape()) return {}; // Okay to duplicate splat constants.
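Splat constants are cheap to duplicate because a splat attribute stores a single element no matter what shape it advertises, which is why the reshape fold just below re-wraps the splat but refuses to copy general constants with more than one use. A toy model (not the MLIR attribute API) of why this re-wrapping is O(1):

#include <cstdint>
#include <vector>

// Toy model of a splat elements attribute: one stored element plus a shape.
struct SplatAttr {
  double Value;               // the single stored element
  std::vector<int64_t> Shape; // the advertised tensor shape
};

// reshape(const(splat)) only swaps the shape; no element data is copied,
// so duplicating the constant costs nothing.
SplatAttr reshapeSplat(const SplatAttr &In, std::vector<int64_t> NewShape) {
  return SplatAttr{In.Value, std::move(NewShape)};
}

int main() {
  SplatAttr Zeros{0.0, {13, 21, 3}};
  SplatAttr Reshaped = reshapeSplat(Zeros, {273, 3});
  return (Reshaped.Value == 0.0 && Reshaped.Shape.size() == 2) ? 0 : 1;
}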
if (operand.isSplat()) - return SplatElementsAttr::get(outputTy, operand.getSplatValue<Attribute>()); + return SplatElementsAttr::get(outputTy, + operand.getSplatValue<Attribute>()); // Don't duplicate other constants. if (!getInput1().hasOneUse())
@@ -905,7 +922,8 @@ OpFoldResult ReverseOp::fold(FoldAdaptor adaptor) { auto operand = getInput(); auto operandTy = llvm::cast<ShapedType>(operand.getType()); auto axis = getAxis(); - auto operandAttr = llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput()); + auto operandAttr = + llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput()); if (operandAttr) return operandAttr;
@@ -954,7 +972,8 @@ OpFoldResult tosa::SelectOp::fold(FoldAdaptor adaptor) { if (getOnTrue() == getOnFalse()) return getOnTrue(); - auto predicate = llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred()); + auto predicate = + llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred()); if (!predicate) return {};
@@ -975,7 +994,8 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { auto resultTy = llvm::cast<ShapedType>(getType()); // Transposing splat values just means reshaping. - if (auto input = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) { + if (auto input = + llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) { if (input.isSplat() && resultTy.hasStaticShape() && input.getType().getElementType() == resultTy.getElementType()) return input.reshape(resultTy);
@@ -986,11 +1006,11 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { return {}; // Transpose is not the identity transpose. - SmallVector<int64_t> perms; + SmallVector<int32_t> perms; if (getConstantPerms(perms).failed()) return {}; - if (!llvm::equal(llvm::seq<int64_t>(0, perms.size()), perms)) + if (!llvm::equal(llvm::seq<int32_t>(0, perms.size()), perms)) return {}; return getInput1();
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index d93db1b..0d0241f 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -204,22 +204,6 @@ void mlir::tosa::printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type, // TOSA Operator Verifiers. //===----------------------------------------------------------------------===// -static bool hasZeroDimension(ShapedType shapedType) { - if (!shapedType.hasRank()) - return false; - - auto rank = shapedType.getRank(); - - for (int i = 0; i < rank; i++) { - if (shapedType.isDynamicDim(i)) - continue; - if (shapedType.getDimSize(i) == 0) - return true; - } - - return false; -} - template <typename T> static LogicalResult verifyConvOp(T op) { // All TOSA conv ops have an input() and weight().
@@ -236,10 +220,6 @@ static LogicalResult verifyConvOp(T op) { return failure(); } - if (hasZeroDimension(inputType)) - return op.emitOpError() << "tensor has a dimension with size zero. Each " "dimension of a tensor must have size >= 1"; - auto inputEType = inputType.getElementType(); auto weightEType = weightType.getElementType();
@@ -262,6 +242,29 @@ static LogicalResult verifyConvOp(T op) { "allowed for float type"); return failure(); } + return success(); +} + +LogicalResult tosa::ConstOp::verify() { + + auto attrType = llvm::dyn_cast<TensorType>(getValueAttr().getType()); + auto outputType = llvm::dyn_cast<TensorType>(getOutput().getType()); + + if (!attrType || !outputType) { + emitOpError("expected tensors for attr/result type"); + return failure(); + } + + if (auto result = llvm::dyn_cast<mlir::quant::QuantizedType>( + outputType.getElementType())) { + if (result.getStorageType() == attrType.getElementType()) + return success(); + } + + if (attrType.getElementType() != outputType.getElementType()) { + emitOpError("expected same attr/result element types"); + return failure(); + } return success(); }
@@ -283,9 +286,6 @@ LogicalResult tosa::ArgMaxOp::verify() { LogicalResult tosa::AvgPool2dOp::verify() { auto inputType = llvm::cast<ShapedType>(getInput().getType()); - if (hasZeroDimension(inputType)) - return emitOpError() << "tensor has a dimension with size zero. Each " "dimension of a tensor must have size >= 1"; auto inputETy = inputType.getElementType(); auto resultETy = llvm::cast<ShapedType>(getType()).getElementType();
@@ -341,9 +341,9 @@ LogicalResult tosa::ClampOp::verify() { if (inputETy != outputETy) return emitOpError("input/output element types are incompatible."); - // if input datatype is float, check that the two min/max_fp attributes share - // the same type and that their type is either the same of the input's - // datatype, or a float type whose bitwidth > input datatype bitwidth + // If input datatype is float, check that the two min/max_fp attributes + // share the same type and that their type is either the same of the input's + // datatype, or a float type whose bitwidth > input datatype bitwidth. if (!inputETy.isInteger(dataTypeBitWidth)) { if (((maxFpType != minFpType) || (maxFpType != inputETy && maxFpType.getIntOrFloatBitWidth() <=
@@ -383,7 +383,8 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result, } } -/// Handles tosa.transpose_conv2d which has outpad and output shape attributes. +/// Handles tosa.transpose_conv2d which has outpad and output shape +/// attributes. static void buildTransConvOpWithQuantInfo( OpBuilder &builder, OperationState &result, Type outputType, Value input, Value weight, Value bias, DenseI64ArrayAttr outpad,
@@ -420,9 +421,9 @@ static void buildFCOpWithQuantInfo(OpBuilder &builder, OperationState &result, } } -/// The tosa.matmul op is also intended to be generated where a fully_connected -/// op must be constructed where the weight is not a constant. In this case, -/// the fully_connected op must be expressed using matmul. +/// The tosa.matmul op is also intended to be generated where a +/// fully_connected op must be constructed where the weight is not a constant. +/// In this case, the fully_connected op must be expressed using matmul. /// TODO: Add link to the leglization document explaining this. static void buildMatMulOpWithQuantInfo(OpBuilder &builder, OperationState &result, Type outputType,
@@ -457,9 +458,9 @@ static void buildMatMulOpWithQuantInfo(OpBuilder &builder, } } -/// Both the tosa.avg_pool2d and unary ops use the same UnaruOpQuantizationAttr -/// but avg_pool operator has its own builder as it has additional parameters -/// not part of the unary ops.
+/// Both the tosa.avg_pool2d and unary ops use the same +/// UnaruOpQuantizationAttr but avg_pool operator has its own builder as it +/// has additional parameters not part of the unary ops. static void buildAvgPool2dOpWithQuantInfo(OpBuilder &builder, OperationState &result, Type outputType, Value input, @@ -526,8 +527,8 @@ static LogicalResult resolveBroadcastShape(const ValueShapeRange &operands, for (int i = 0, e = operands.size(); i != e; ++i) { auto shape = operands.getShape(i); if (!shape.hasRank()) { - // TODO(jennik): Update function to have better case handling for invalid - // operands and for ranked tensors. + // TODO(jennik): Update function to have better case handling for + // invalid operands and for ranked tensors. return failure(); } outRank = std::max<int64_t>(outRank, shape.getRank()); @@ -776,8 +777,8 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents( return success(); } - // If the input rank is unknown we can info the output rank using the padding - // shape's first dim. + // If the input rank is unknown we can info the output rank using the + // padding shape's first dim. if (!inputShape.hasRank()) { if (paddingShape.isDynamicDim(0)) { inferredReturnShapes.push_back(ShapedTypeComponents()); @@ -1000,10 +1001,6 @@ llvm::LogicalResult tosa::ReshapeOp::verify() { TensorType inputType = getInput1().getType(); RankedTensorType outputType = getType(); - if (hasZeroDimension(inputType) || hasZeroDimension(outputType)) - return emitOpError() << "tensor has a dimension with size zero. Each " - "dimension of a tensor must have size >= 1"; - if ((int64_t)getNewShape().size() != outputType.getRank()) return emitOpError() << "new shape does not match result rank"; @@ -1034,16 +1031,15 @@ llvm::LogicalResult tosa::ReshapeOp::verify() { return mlir::success(); } -LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int64_t> &perms) { +LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int32_t> &perms) { // Perms must be constants. DenseIntElementsAttr permsAttr; if (!matchPattern(getPerms(), m_Constant(&permsAttr))) return failure(); - // Transpose is not the identity transpose. - perms = llvm::to_vector( - llvm::map_range(permsAttr.getValues<APInt>(), - [](const APInt &val) { return val.getSExtValue(); })); + perms.clear(); + for (auto v : permsAttr.getValues<APInt>()) + perms.push_back(v.getSExtValue()); return success(); } @@ -1067,8 +1063,8 @@ LogicalResult tosa::TransposeOp::inferReturnTypeComponents( return success(); } - // This would imply the number of permutations does not match the rank of the - // input which is illegal. + // This would imply the number of permutations does not match the rank of + // the input which is illegal. if (permsShape.getDimSize(0) != inputShape.getRank()) { return failure(); } @@ -1154,19 +1150,38 @@ LogicalResult tosa::TransposeOp::verify() { << " (output rank) but got size " << permType.getDimSize(0); - SmallVector<int64_t> constantPerms; + SmallVector<int32_t> constantPerms; if (succeeded(getConstantPerms(constantPerms))) { - // Assert that the permutation tensor has a rank, which means that the rank - // has been verified above. + // Assert that the permutation tensor has a rank, which means that the + // rank has been verified above. 
assert(permType.hasRank() && "Unexpectedly found permutation tensor without rank"); - if (!isPermutationVector(constantPerms)) + if (!llvm::all_of(constantPerms, + [&constantPerms](int32_t s) { + return s >= 0 && + static_cast<size_t>(s) < constantPerms.size(); + }) || + !isPermutationVector(llvm::to_vector(llvm::map_range( + constantPerms, [](int32_t v) -> int64_t { return v; })))) return emitOpError() << "expected valid permutation tensor"; - if (inputType.hasRank() && !llvm::all_of(constantPerms, [&](int64_t s) { - return s < inputType.getRank(); - })) { - return emitOpError() << "permutation must be within input bounds"; + // Verify that the types of the input and output tensors are properly + // permuted. + if (inputType.hasRank() && outputType.hasRank()) { + assert(constantPerms.size() == static_cast<size_t>(inputType.getRank()) && + inputType.getRank() == outputType.getRank()); + + for (auto i = 0; i < outputType.getRank(); i++) { + if (inputType.isDynamicDim(constantPerms[i]) || + outputType.isDynamicDim(i)) + continue; + + if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i)) + return emitOpError() + << "expected output tensor dim " << i << " to match " + << "input dim " << constantPerms[i] << " with value of " + << inputType.getDimSize(constantPerms[i]); + } } } return success(); @@ -1175,7 +1190,7 @@ LogicalResult tosa::TransposeOp::verify() { LogicalResult TransposeOp::reifyResultShapes( OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - SmallVector<int64_t> transposePerms; + SmallVector<int32_t> transposePerms; if (getConstantPerms(transposePerms).failed()) return failure(); @@ -1184,7 +1199,7 @@ LogicalResult TransposeOp::reifyResultShapes( SmallVector<OpFoldResult> returnedDims(inputType.getRank()); for (auto dim : transposePerms) { - int64_t dimInInput = transposePerms[dim]; + int32_t dimInInput = transposePerms[dim]; if (inputType.isDynamicDim(dimInInput)) returnedDims[dim] = builder.create<tensor::DimOp>(getLoc(), input, dimInInput) @@ -1378,8 +1393,8 @@ static LogicalResult verifyReduceOp(T op) { << ")"; return failure(); } - // We can only verify the reduced dimension size to be 1 if this is not the - // special case of output rank == 0. + // We can only verify the reduced dimension size to be 1 if this is not + // the special case of output rank == 0. 
if (outputRank != 0) { auto outputShape = outputType.getShape(); if (!outputType.isDynamicDim(reduceAxis) && diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index 39699ee..0d55d1899 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s -// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s -// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s +// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s +// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s +// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s // CHECK-LABEL: @matmul func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { @@ -521,7 +521,7 @@ func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tens // CHECK-LABEL: @conv2d_i8 func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () { - // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64> + // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32> // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32> // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) { @@ -542,7 +542,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi // CHECK-LABEL: @conv2d_f32 func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () { - // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64> + // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32> // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0] // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32> diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir index c2bbfd5..73da281 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir @@ -24,7 +24,7 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> { // 
check that tosa verify kick in func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> { - // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}} + // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}} %0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} : (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> return %0 : tensor<1x7x7x9xf32> diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir index 8e19f87..2902c4a 100644 --- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir +++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir @@ -80,14 +80,14 @@ func.func @transpose_fold_4d_int() -> tensor<3x1x4x2xi32> { [[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] ]]> : tensor<1x2x3x4xi32>} : () -> tensor<1x2x3x4xi32> - %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> + %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> // CHECK: %[[CST:.+]] = "tosa.const"() <{ // CHECK-SAME{LITERAL}: value = dense<[ // CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]], // CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]], // CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]] // CHECK-SAME{LITERAL}: ]> - %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi64>) -> tensor<3x1x4x2xi32> + %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<3x1x4x2xi32> // CHECK: return %[[CST]] return %1 : tensor<3x1x4x2xi32> } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 418f768..414bcfe 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -1,6 +1,22 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate=strict-op-spec-alignment +func.func @test_const() -> tensor<1xf32> { + // expected-error@+1{{'tosa.const' op expected same attr/result element types}} + %0 = "tosa.const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +// ----- + +func.func @test_const_non_tensor_attr() { + // expected-error@+1{{tosa.const' op expected tensors for attr/result type}} + %0 = "tosa.const"() {value = dense<1.0> : vector<f32>} : () -> tensor<f32> + return +} + +// ----- + func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> { // expected-error@+1 {{expect both input and weight to be float or not together, got 'f32' and 'i8'}} %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} @@ -148,6 +164,42 @@ func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>) // ----- +func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> { + %perms = "tosa.const"() {value = dense<[-1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}} + %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32> + return %1 : tensor<*xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: 
tensor<3x2xi32>) -> tensor<*xi32> { + %perms = "tosa.const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}} + %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32> + return %1 : tensor<*xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> { + %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}} + %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %1 : tensor<3x4xi32> +} + +// ----- + +func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> { + %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}} + %1 = tosa.transpose %arg0, %perms : (tensor<2x?xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %1 : tensor<3x4xi32> +} + +// ----- + func.func @test_fully_connected_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<273x2xf32> { %0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> %1 = tosa.reshape %arg0 {new_shape = array<i64: 273, 3>} : (tensor<13x21x3xf32>) -> tensor<273x3xf32> @@ -269,7 +321,7 @@ func.func @test_reshape_type_mismatch(%arg0 : tensor<13x21x3xf32>) -> () { // ----- func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> () { - // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}} + // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<13x0x3xf32>'}} %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<13x0x3xf32>) -> tensor<13x0x3xf32> return } @@ -277,7 +329,7 @@ func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> () // ----- func.func @test_reshape_zero_dim_input(%arg0 : tensor<?x0x3xf32>) -> () { - // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}} + // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<?x0x3xf32>'}} %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<?x0x3xf32>) -> tensor<13x0x3xf32> return } @@ -341,7 +393,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> { // ----- func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> { - // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. 
@@ -269,7 +321,7 @@ func.func @test_reshape_type_mismatch(%arg0 : tensor<13x21x3xf32>) -> () {
 
 // -----
 
 func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> () {
-  // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+  // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<13x0x3xf32>'}}
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<13x0x3xf32>) -> tensor<13x0x3xf32>
   return
 }
@@ -277,7 +329,7 @@ func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> ()
 
 // -----
 
 func.func @test_reshape_zero_dim_input(%arg0 : tensor<?x0x3xf32>) -> () {
-  // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+  // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<?x0x3xf32>'}}
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<?x0x3xf32>) -> tensor<13x0x3xf32>
   return
 }
@@ -341,7 +393,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> {
 
 // -----
 
 func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
-  // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+  // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x29x0x4xf32>'}}
   %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
     : (tensor<1x29x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
   return %0 : tensor<1x27x27x16xf32>
@@ -350,8 +402,8 @@ func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1:
 
 // -----
 
 func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
-  // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x?x0x4xf32>'}}
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
     : (tensor<1x?x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
   return %0 : tensor<1x27x27x16xf32>
 }
@@ -360,7 +412,7 @@ func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<
 
 // -----
 
 func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32> {
-  // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+  // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x7x9xf32>'}}
   %0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
     : (tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32>
   return %0 : tensor<1x7x7x9xf32>
@@ -369,7 +421,7 @@ func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) ->
 
 // -----
 
 func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> {
-  // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+  // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}}
   %0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
     : (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
   return %0 : tensor<1x7x7x9xf32>
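Each of the reshape/conv2d/avg_pool2d cases above swaps the old custom "size zero" verifier message for the ODS-generated "tosa-conformant tensor" wording, i.e. zero-extent shapes are now rejected by the operand type constraint itself. A shape with a dynamic (rather than zero) extent should still verify, e.g. (sketch under that assumption, function name illustrative):

func.func @avg_pool2d_conformant_sketch(%arg0: tensor<1x?x7x9xf32>) -> tensor<1x?x7x9xf32> {
  // A dynamic height is conformant; only a static zero extent
  // violates the 4-d tosa-conformant tensor constraint.
  %0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
    : (tensor<1x?x7x9xf32>) -> tensor<1x?x7x9xf32>
  return %0 : tensor<1x?x7x9xf32>
}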
@@ -469,7 +521,7 @@ func.func @test_tile_io_rank_mismatch() {
 
 // CHECK-LABEL: @test_invalid_constant_permutation
 func.func @test_invalid_constant_permutation() {
-  // expected-error@+3 {{permutation must be within input bounds}}
+  // expected-error@+3 {{'tosa.transpose' op expected valid permutation tensor}}
   %0 = tensor.empty() : tensor<3x4x5xi32>
   %1 = arith.constant dense<[3, 0, 1]> : tensor<3xi32>
   %2 = tosa.transpose %0, %1 : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<3x4x5xi32>
@@ -480,7 +532,7 @@ func.func @test_invalid_constant_permutation() {
 
 // CHECK-LABEL: test_rank_size_constant_permutation
 func.func @test_rank_size_constant_permutation() {
-  // expected-error@+4 {{permutation must be within input bounds}}
+  // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
   %0 = arith.constant 6 : index
   %1 = arith.constant dense<[0, 2]> : tensor<2xi32>
   %2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -492,7 +544,7 @@ func.func @test_rank_size_constant_permutation() {
 
 // CHECK-LABEL: test_large_constant_permutation
 func.func @test_large_constant_permutation() {
-  // expected-error@+4 {{permutation must be within input bounds}}
+  // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
   %0 = arith.constant 6 : index
   %1 = arith.constant dense<[1185677355, 332462212]> : tensor<2xi32>
   %2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -504,7 +556,7 @@ func.func @test_large_constant_permutation() {
 
 // CHECK-LABEL: test_table_rank0_table
 func.func @test_table_rank0_table(%arg0: tensor<64xi16>, %arg1: tensor<i16>) {
-  // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tensor, but got 'tensor<i16>'}}
+  // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tosa-conformant tensor, but got 'tensor<i16>'}}
   %0 = tosa.table %arg0, %arg1 : (tensor<64xi16>, tensor<i16>) -> tensor<64xi16>
   return
 }
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 525ee91..a1600fd 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -574,6 +574,22 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
 }
 
 // -----
+// CHECK-LABEL: transpose_dynamic_dim
+func.func @test_transpose_dynamic_dim(%arg0: tensor<13x?x3xf32>) -> tensor<3x13x?xf32> {
+  %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+  %1 = tosa.transpose %arg0, %0 : (tensor<13x?x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+  return %1 : tensor<3x13x?xf32>
+}
+
+// -----
+// CHECK-LABEL: transpose_half_dynamic_dim
+func.func @test_transpose_half_dynamic_dim(%arg0: tensor<13x3x3xf32>) -> tensor<3x13x?xf32> {
+  %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+  %1 = tosa.transpose %arg0, %0 : (tensor<13x3x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+  return %1 : tensor<3x13x?xf32>
+}
+
+// -----
 // CHECK-LABEL: gather
 func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xf32> {
   %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x26xi32>) -> tensor<13x26x3xf32>
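The two new ops.mlir round-trips confirm that the transpose verifier's dim-matching check skips any dimension that is dynamic on either side. A mixed case in the same spirit (sketch, not part of the commit):

func.func @transpose_mixed_dynamic_sketch(%arg0: tensor<?x21x3xf32>) -> tensor<3x?x21xf32> {
  %perms = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
  // Result dim 1 maps to the dynamic input dim 0 and is skipped;
  // the static dims (3 and 21) must still match the permuted input.
  %1 = tosa.transpose %arg0, %perms : (tensor<?x21x3xf32>, tensor<3xi32>) -> tensor<3x?x21xf32>
  return %1 : tensor<3x?x21xf32>
}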
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f3d3c74..b86fcac 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -273,6 +273,16 @@ libc_support_library(
 )
 
 libc_support_library(
+    name = "__support_macros_null_check",
+    hdrs = ["src/__support/macros/null_check.h"],
+    deps = [
+        ":__support_macros_config",
+        ":__support_macros_optimization",
+        ":__support_macros_sanitizer",
+    ],
+)
+
+libc_support_library(
     name = "__support_common",
     hdrs = [
         "src/__support/common.h",
@@ -665,6 +675,7 @@ libc_support_library(
         ":__support_ctype_utils",
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
+        ":__support_macros_null_check",
         ":__support_str_to_integer",
         ":__support_str_to_num_result",
         ":__support_uint128",